From 3cb975806ad4a14243071d6cc9379f3c3ee3d211 Mon Sep 17 00:00:00 2001
From: qaate47
Date: Thu, 15 Sep 2022 16:46:39 +0200
Subject: [PATCH 1/9] Implement HDTCatTree to create an HDT with low resources
 using HDTCat

---
 .../java/org/rdfhdt/hdt/hdt/HDTManager.java   |  59 +++++
 .../java/org/rdfhdt/hdt/hdt/HDTSupplier.java  |  42 ++++
 .../java/org/rdfhdt/hdt/rdf/RDFFluxStop.java  | 102 +++++++++
 .../java/org/rdfhdt/hdt/tools/RDF2HDT.java    |  49 ++++-
 .../impl/FourSectionDictionaryCat.java        |  39 +++-
 .../impl/MultipleSectionDictionaryCat.java    |  44 +++-
 .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java    | 127 ++++++++++-
 .../java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java |  40 ++--
 .../org/rdfhdt/hdt/header/PlainHeader.java    |   2 +-
 .../utils/FluxStopTripleStringIterator.java   |  65 ++++++
 .../hdt/iterator/utils/MapIterator.java       |  44 ++++
 .../hdt/iterator/utils/PipedCopyIterator.java | 198 +++++++++++++++++
 .../org/rdfhdt/hdt/rdf/RDFParserFactory.java  |  37 ++++
 .../impl/BitmapTriplesIteratorCat.java        |   5 +-
 .../hdt/util/concurrent/ExceptionThread.java  | 198 +++++++++++++++++
 .../hdt/util/listener/PrefixListener.java     |  42 ++++
 .../org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java | 208 ++++++++++++++++++
 .../util/LargeFakeDataSetStreamSupplier.java  |  81 ++++++-
 18 files changed, 1328 insertions(+), 54 deletions(-)
 create mode 100644 hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java
 create mode 100644 hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java
 create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FluxStopTripleStringIterator.java
 create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java
 create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java
 create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java
 create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixListener.java
 create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java

diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
index d3f33f70..da45017b 100644
--- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
+++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
@@ -10,6 +10,7 @@
 import org.rdfhdt.hdt.exceptions.ParserException;
 import org.rdfhdt.hdt.listener.ProgressListener;
 import org.rdfhdt.hdt.options.HDTOptions;
+import org.rdfhdt.hdt.rdf.RDFFluxStop;
 import org.rdfhdt.hdt.rdf.TripleWriter;
 import org.rdfhdt.hdt.triples.TripleString;
 
@@ -339,6 +340,61 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
 		return HDTManager.getInstance().doHDTDiffBit(location, hdtFileName, deleteBitmap, hdtFormat, listener);
 	}
 
+
+	/**
+	 * Create an HDT file from an RDF file by merging intermediate HDTs in a tree;
+	 * the chunk creation is stopped by the fluxStop condition.
+	 *
+	 * @param fluxStop Flux stopper
+	 * @param supplier HDT supplier to create the intermediate HDTs before the cat
+	 * @param rdfFileName File name.
+	 * @param baseURI Base URI for the dataset.
+	 * @param rdfNotation Format of the source RDF file (NTriples, N3, RDF-XML...)
+	 * @param hdtFormat Parameters to tune the generated HDT.
+	 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
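+	 *                 A possible call, with hypothetical file and URI names (sketch):
+	 * <pre>{@code
+	 * try (HDT hdt = HDTManager.catTree(RDFFluxStop.countLimit(1_000_000),
+	 *         HDTSupplier.memory(), "dataset.nt", "http://example.org/#",
+	 *         RDFNotation.NTRIPLES, new HDTSpecification(), null)) {
+	 *     // use or save the generated HDT
+	 * }
+	 * }</pre>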
+	 *
+	 * @throws IOException when the file cannot be found
+	 * @throws ParserException when the file cannot be parsed
+	 * @return HDT
+	 */
+	public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
+		return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, rdfFileName, baseURI, rdfNotation, hdtFormat, listener);
+	}
+	/**
+	 * Create an HDT file from an RDF stream, stopping the chunk creation with the fluxStop
+	 *
+	 * @param fluxStop Flux stopper
+	 * @param supplier HDT supplier to create the intermediate HDTs before the cat
+	 * @param rdfStream Stream.
+	 * @param baseURI Base URI for the dataset.
+	 * @param rdfNotation Format of the source RDF file (NTriples, N3, RDF-XML...)
+	 * @param hdtFormat Parameters to tune the generated HDT.
+	 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
+	 *
+	 * @throws IOException when the file cannot be found
+	 * @throws ParserException when the file cannot be parsed
+	 * @return HDT
+	 */
+	public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream rdfStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
+		return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, rdfStream, baseURI, rdfNotation, hdtFormat, listener);
+	}
+
+	/**
+	 * Create an HDT from an RDF iterator, stopping the chunk creation with the fluxStop
+	 *
+	 * @param fluxStop Flux stopper
+	 * @param supplier HDT supplier to create the intermediate HDTs before the cat
+	 * @param iterator A provider of triples. Must implement hasNext() and next().
+	 * @param baseURI Base URI for the dataset.
+	 * @param hdtFormat Parameters to tune the generated HDT.
+	 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
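+	 *                 A possible call (sketch; {@code triples} being any collection
+	 *                 of TripleString and {@code spec} any HDTOptions):
+	 * <pre>{@code
+	 * // one chunk only: equivalent to generating a single HDT from the iterator
+	 * HDT hdt = HDTManager.catTree(RDFFluxStop.noLimit(), HDTSupplier.memory(),
+	 *         triples.iterator(), "http://example.org/#", spec, null);
+	 * }</pre>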
+	 * @throws IOException when the file cannot be found
+	 * @throws ParserException when the file cannot be parsed
+	 * @return HDT
+	 */
+	public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
+		return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, iterator, baseURI, hdtFormat, listener);
+	}
+
 	// Abstract methods for the current implementation
 	protected abstract HDTOptions doReadOptions(String file) throws IOException;
 	protected abstract HDT doLoadHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException;
@@ -355,5 +411,8 @@
 	protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
 	protected abstract HDT doHDTDiff(String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
 	protected abstract HDT doHDTDiffBit(String location, String hdtFileName, Bitmap deleteBitmap, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
+	protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, String filename, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
+	protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream stream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
+	protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
 }
diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java
new file mode 100644
index 00000000..7ad21ace
--- /dev/null
+++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java
@@ -0,0 +1,42 @@
+package org.rdfhdt.hdt.hdt;
+
+import org.rdfhdt.hdt.exceptions.ParserException;
+import org.rdfhdt.hdt.listener.ProgressListener;
+import org.rdfhdt.hdt.options.HDTOptions;
+import org.rdfhdt.hdt.triples.TripleString;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.Iterator;
+
+/**
+ * Interface describing an HDT generator method
+ *
+ * @author Antoine Willerval
+ */
+@FunctionalInterface
+public interface HDTSupplier {
+	/**
+	 * @return implementation using in-memory hdt
+	 */
+	static HDTSupplier memory() {
+		return (iterator, baseURI, hdtFormat, listener, location) -> {
+			try (HDT hdt = HDTManager.generateHDT(iterator, baseURI, hdtFormat, listener)) {
+				hdt.saveToHDT(location.toAbsolutePath().toString(), listener);
+			}
+		};
+	}
+
+	/**
+	 * Generate the HDT
+	 *
+	 * @param iterator the iterator to create the hdt
+	 * @param baseURI the base URI (not always used, but required by some generation methods)
+	 * @param hdtFormat the HDT options to create the HDT
+	 * @param listener listener
+	 * @param location where to write the HDT
+	 * @throws IOException io exception while creating the HDT
+	 * @throws ParserException parser exception while retrieving the triples
+	 */
+	void doGenerateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener, Path location) throws IOException, ParserException;
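+
+	// A minimal sketch (assumed names, not part of the interface contract) of a
+	// custom supplier wrapping the in-memory implementation, e.g. to log each chunk:
+	//
+	//   HDTSupplier logged = (it, uri, opt, listener, location) -> {
+	//       HDTSupplier.memory().doGenerateHDT(it, uri, opt, listener, location);
+	//       System.out.println("chunk written to " + location);
+	//   };
+}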
diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java
new file mode 100644
index 00000000..dc4f5de5
--- /dev/null
+++ b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java
@@ -0,0 +1,102 @@
+package org.rdfhdt.hdt.rdf;
+
+import org.rdfhdt.hdt.triples.TripleString;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * RDF flux stopper descriptor
+ * @author Antoine Willerval
+ */
+public interface RDFFluxStop {
+	/**
+	 * @return basic implementation without any limit
+	 */
+	static RDFFluxStop noLimit() {
+		return new RDFFluxStop() {
+			@Override
+			public boolean canHandle(TripleString ts) {
+				return true;
+			}
+
+			@Override
+			public void restart() {
+				// nothing
+			}
+		};
+	}
+
+	/**
+	 * implementation of a flux stop stopping after a maximum triple count
+	 *
+	 * @param maxTriple maximum count
+	 * @return FluxStop
+	 */
+	static RDFFluxStop countLimit(long maxTriple) {
+		if (maxTriple <= 0) {
+			throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!");
+		}
+		return new RDFFluxStop() {
+			long current = 0;
+
+			@Override
+			public boolean canHandle(TripleString ts) {
+				return current++ < maxTriple;
+			}
+
+			@Override
+			public void restart() {
+				current = 0;
+			}
+		};
+	}
+
+	/**
+	 * implementation of a flux stop stopping after a maximum NTriples size
+	 *
+	 * @param maxSize maximum size
+	 * @return FluxStop
+	 */
+	static RDFFluxStop sizeLimit(long maxSize) {
+		if (maxSize <= 0) {
+			throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!");
+		}
+		return new RDFFluxStop() {
+			long size = 0;
+
+			@Override
+			public boolean canHandle(TripleString ts) {
+				long tsSize;
+				try {
+					tsSize = ts.asNtriple().toString().getBytes(StandardCharsets.UTF_8).length;
+				} catch (IOException e) {
+					throw new RuntimeException("Can't estimate the size of the triple " + ts, e);
+				}
+				// check the size before counting the current triple, so a flux can
+				// slightly exceed maxSize with its last triple
+				try {
+					return size < maxSize;
+				} finally {
+					size += tsSize;
+				}
+			}
+
+			@Override
+			public void restart() {
+				size = 0;
+			}
+		};
+	}
+
+	/**
+	 * Can this triple be handled in the current flux? A false return value stops the current flux.
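+	 * A worked example with {@link #countLimit(long)}, t1..t3 being any triples (sketch):
+	 * <pre>{@code
+	 * RDFFluxStop limit = RDFFluxStop.countLimit(2);
+	 * limit.canHandle(t1); // true
+	 * limit.canHandle(t2); // true
+	 * limit.canHandle(t3); // false, the current flux stops here
+	 * limit.restart();     // reset the count to start a new flux
+	 * }</pre>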
+	 *
+	 * @param ts the triple
+	 * @return true if the flux can handle this triple, false otherwise
+	 */
+	boolean canHandle(TripleString ts);
+
+	/**
+	 * restart the flux stop
+	 */
+	void restart();
+}
diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
index 494235f9..b330033b 100644
--- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
+++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
@@ -33,20 +33,30 @@
 import org.rdfhdt.hdt.exceptions.ParserException;
 import org.rdfhdt.hdt.hdt.HDT;
 import org.rdfhdt.hdt.hdt.HDTManager;
+import org.rdfhdt.hdt.hdt.HDTSupplier;
 import org.rdfhdt.hdt.hdt.HDTVersion;
 import org.rdfhdt.hdt.listener.ProgressListener;
 import org.rdfhdt.hdt.options.HDTSpecification;
+import org.rdfhdt.hdt.rdf.RDFFluxStop;
 import org.rdfhdt.hdt.util.StopWatch;
 
 import com.beust.jcommander.JCommander;
 import com.beust.jcommander.Parameter;
 import com.beust.jcommander.internal.Lists;
+import org.rdfhdt.hdt.util.StringUtil;
 
 /**
  * @author mario.arias
  *
  */
 public class RDF2HDT implements ProgressListener {
+	/**
+	 * @return an estimation of the chunk size to use with HDTCatTree, based on
+	 *         the memory currently available to the JVM
+	 */
+	private static long getMaxTreeCatChunkSize() {
+		Runtime runtime = Runtime.getRuntime();
+		// free memory, divided by a safety factor: only ~85% of it is considered
+		// usable, and the chunk data is held multiple times during the generation
+		return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) / (0.85 * 5));
+	}
 
 	public String rdfInput;
 	public String hdtOutput;
@@ -77,7 +87,13 @@ public class RDF2HDT implements ProgressListener {
 	@Parameter(names = "-canonicalntfile", description = "Only for NTriples input. Use a Fast NT file parser the input should be in a canonical form. See https://www.w3.org/TR/n-triples/#h2_canonical-ntriples")
 	public boolean ntSimpleLoading;
-
+
+	@Parameter(names = "-cattree", description = "Use HDTCatTree to split the HDT creation for big datasets")
+	public boolean catTree;
+
+	@Parameter(names = "-cattreelocation", description = "Only with -cattree, set the tree building location")
+	public String catTreeLocation;
+
 	public void execute() throws ParserException, IOException {
 		HDTSpecification spec;
 		if(configFile!=null) {
@@ -115,7 +131,30 @@ public void execute() throws ParserException, IOException {
 		}
 
 		StopWatch sw = new StopWatch();
-		HDT hdt = HDTManager.generateHDT(rdfInput, baseURI,notation , spec, this);
+		HDT hdt;
+
+		if (catTree) {
+			if (catTreeLocation != null) {
+				spec.set("loader.cattree.location", catTreeLocation);
+			}
+			spec.set("loader.cattree.futureHDTLocation", hdtOutput);
+
+			long maxTreeCatChunkSize = getMaxTreeCatChunkSize();
+
+			System.out.println("Compute HDT with HDTCatTree using chunks of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true));
+
+			hdt = HDTManager.catTree(
+					RDFFluxStop.sizeLimit(maxTreeCatChunkSize),
+					HDTSupplier.memory(),
+					rdfInput,
+					baseURI,
+					notation,
+					spec,
+					this
+			);
+		} else {
+			hdt = HDTManager.generateHDT(rdfInput, baseURI, notation, spec, this);
+		}
 
 		System.out.println("File converted in: "+sw.stopAndShow());
 
 		try {
@@ -130,7 +169,11 @@ public void execute() throws ParserException, IOException {
 
 			// Dump to HDT file
 			sw = new StopWatch();
-			hdt.saveToHDT(hdtOutput, this);
+
+			if (!catTree) {
+				// skip saving with -cattree: the HDT was already written to hdtOutput
+				hdt.saveToHDT(hdtOutput, this);
+			}
 			System.out.println("HDT saved to file in: "+sw.stopAndShow());
 
 			// Generate index and dump it to .hdt.index file
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java index 2a8f4ef2..32c12bca 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java @@ -29,6 +29,7 @@ import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.PrefixListener; import java.io.*; import java.nio.file.Files; @@ -59,7 +60,13 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener allMappings.put("SH1",new CatMapping(location,"SH1",dictionary1.getShared().getNumberOfElements())); allMappings.put("SH2",new CatMapping(location,"SH2",dictionary2.getShared().getNumberOfElements())); - System.out.println("PREDICATES-------------------"); +// System.out.println("PREDICATES-------------------"); + ProgressListener iListener; + + iListener = PrefixListener.of("Generate predicates: ", listener); + if (iListener != null) { + iListener.notifyProgress(0, "start"); + } int numCommonPredicates = 0; @@ -67,7 +74,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener while (commonP1P2.hasNext()){ commonP1P2.next(); numCommonPredicates++; - //ListenerUtil.notifyCond(listener, "Analyze common predicates", numCommonPredicates, numCommonPredicates, maxPredicates); + //ListenerUtil.notifyCond(iListener, "Analyze common predicates", numCommonPredicates, numCommonPredicates, maxPredicates); } long numPredicates = dictionary1.getPredicates().getNumberOfElements()+dictionary2.getPredicates().getNumberOfElements()-numCommonPredicates; @@ -75,8 +82,12 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener addPredicatesList.add(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),"P1")); addPredicatesList.add(new CatWrapper(dictionary2.getPredicates().getSortedEntries(),"P2")); CatUnion itAddPredicates = new CatUnion(addPredicatesList); - SectionUtil.createSection(location,numPredicates, 4,itAddPredicates, new CatUnion(new ArrayList<>()),allMappings,0, listener); - System.out.println("SUBJECTS-------------------"); + SectionUtil.createSection(location,numPredicates, 4,itAddPredicates, new CatUnion(new ArrayList<>()),allMappings,0, iListener); +// System.out.println("SUBJECTS-------------------"); + iListener = PrefixListener.of("Generate subjects: ", listener); + if (iListener != null) { + iListener.notifyProgress(0, "start"); + } ArrayList> skipSubjectList = new ArrayList<>(); skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"),new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); @@ -110,9 +121,13 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener addSubjectsList.add(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2")); CatUnion itAddSubjects = new CatUnion(addSubjectsList); - SectionUtil.createSection(location,numSubjects, 2,itAddSubjects,skipSubject ,allMappings,0, listener); + SectionUtil.createSection(location,numSubjects, 2,itAddSubjects,skipSubject ,allMappings,0, iListener); - System.out.println("OBJECTS-------------------"); +// System.out.println("OBJECTS-------------------"); + iListener = PrefixListener.of("Generate objects: ", listener); + if (iListener != null) { + iListener.notifyProgress(0, "start"); + } ArrayList> skipObjectsList = new 
ArrayList<>(); skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"),new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"),new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"))); @@ -148,9 +163,13 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener addObjectsList.add(new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2")); CatUnion itAddObjects = new CatUnion(addObjectsList); - SectionUtil.createSection(location,numObject, 3,itAddObjects,skipObject ,allMappings,0, listener); + SectionUtil.createSection(location,numObject, 3,itAddObjects,skipObject ,allMappings,0, iListener); - System.out.println("SHARED-------------------"); +// System.out.println("SHARED-------------------"); + iListener = PrefixListener.of("Generate shared: ", listener); + if (iListener != null) { + iListener.notifyProgress(0, "start"); + } CatIntersection i2 = new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"), new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2")); int numCommonS1O2=0; while (i2.hasNext()){ @@ -184,7 +203,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2"),new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); CatUnion itAddShared = new CatUnion(addSharedList); - SectionUtil.createSection(location,numShared, 1,itAddShared,new CatUnion(new ArrayList<>()) ,allMappings,0, listener); + SectionUtil.createSection(location,numShared, 1,itAddShared,new CatUnion(new ArrayList<>()) ,allMappings,0, iListener); //Putting the sections together @@ -254,4 +273,4 @@ public HashMap getAllMappings() { public long getNumShared() { return numShared; } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java index 5a814e54..5fd5c9f5 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java @@ -35,6 +35,7 @@ import org.rdfhdt.hdt.util.crc.CRCOutputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.ListenerUtil; +import org.rdfhdt.hdt.util.listener.PrefixListener; import org.rdfhdt.hdt.util.string.ByteStringUtil; import java.io.*; @@ -97,8 +98,14 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener countSubSections2++; } + ProgressListener iListener; + +// System.out.println("PREDICATES-------------------"); + iListener = PrefixListener.of("Generate predicates: ", listener); + if (iListener != null) { + iListener.notifyProgress(0, "start"); + } - System.out.println("PREDICATES-------------------"); int numCommonPredicates = 0; @@ -114,10 +121,15 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener addPredicatesList.add(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),"P1")); addPredicatesList.add(new CatWrapper(dictionary2.getPredicates().getSortedEntries(),"P2")); CatUnion itAddPredicates = new CatUnion(addPredicatesList); - catSection(numPredicates, 3,itAddPredicates, new CatUnion(new 
ArrayList<>()),allMappings, listener); + catSection(numPredicates, 3,itAddPredicates, new CatUnion(new ArrayList<>()),allMappings, iListener); - System.out.println("SUBJECTS-------------------"); +// System.out.println("SUBJECTS-------------------"); + iListener = PrefixListener.of("Generate subjects: ", listener); + if (iListener != null) { + iListener.notifyProgress(0, "start"); + } + ArrayList> skipSubjectList = new ArrayList<>(); skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"), @@ -164,9 +176,14 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener addSubjectsList.add(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2")); CatUnion itAddSubjects = new CatUnion(addSubjectsList); - catSection(numSubjects, 2,itAddSubjects,skipSubject ,allMappings, listener); + catSection(numSubjects, 2,itAddSubjects,skipSubject ,allMappings, iListener); + +// System.out.println("OBJECTS-------------------"); + iListener = PrefixListener.of("Generate objects: ", listener); + if (iListener != null) { + iListener.notifyProgress(0, "start"); + } - System.out.println("OBJECTS-------------------"); ArrayList> skipObjectsList = new ArrayList<>(); if(dictionary1.getAllObjects().containsKey(NO_DT_OBJECTS)) { skipObjectsList.add(new CatIntersection( @@ -406,10 +423,10 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener // subtract the number of objects to be skipped - if creating do data type section if(dataType.equals(NO_DT_OBJECTS)) { numberElts -= numSkipObjects; - catSection(numberElts, type, itAddObjects, skipObject, allMappings, listener); + catSection(numberElts, type, itAddObjects, skipObject, allMappings, iListener); } else // if catting literals sections .. 
nothing will move (nothing to be skipped) - catSection(numberElts,type,itAddObjects,new CatUnion(new ArrayList<>()),allMappings,listener); + catSection(numberElts,type,itAddObjects,new CatUnion(new ArrayList<>()),allMappings,iListener); if(numberElts > 0 ) { dataTypes.add(dataType); offsets.put(dataType, total); @@ -417,7 +434,12 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener total+=numberElts; type++; } - System.out.println("SHARED-------------------"); +// System.out.println("SHARED-------------------"); + iListener = PrefixListener.of("Generate shared: ", listener); + if (iListener != null) { + iListener.notifyProgress(0, "start"); + } + int numCommonS1O2 = 0; if(dictionary2.getAllObjects().containsKey(NO_DT_OBJECTS)) { CatIntersection i2 = new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(), "S1"), new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "2")); @@ -459,7 +481,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"),new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); CatUnion itAddShared = new CatUnion(addSharedList); - catSection(numShared, 1,itAddShared,new CatUnion(new ArrayList<>()) ,allMappings, listener); + catSection(numShared, 1,itAddShared,new CatUnion(new ArrayList<>()) ,allMappings, iListener); //Putting the sections together @@ -475,7 +497,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener outFinal.write(dataTypes.size()); for(String datatype:dataTypes){ outFinal.write(datatype.length()); - IOUtil.writeBuffer(outFinal, datatype.getBytes(), 0, datatype.getBytes().length, listener); + IOUtil.writeBuffer(outFinal, datatype.getBytes(), 0, datatype.getBytes().length, iListener); } } Files.copy(Path.of(location + "section" + i), outFinal); @@ -687,4 +709,4 @@ public HashMap getAllMappings() { public long getNumShared() { return numShared; } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index ec06a859..7dd0be6c 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -10,17 +10,28 @@ import org.rdfhdt.hdt.hdt.impl.TempHDTImporterTwoPass; import org.rdfhdt.hdt.hdt.writer.TripleWriterHDT; import org.rdfhdt.hdt.header.HeaderUtil; +import org.rdfhdt.hdt.iterator.utils.FluxStopTripleStringIterator; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.rdf.RDFFluxStop; +import org.rdfhdt.hdt.rdf.RDFParserCallback; +import org.rdfhdt.hdt.rdf.RDFParserFactory; import org.rdfhdt.hdt.rdf.TripleWriter; import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.PrefixListener; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; import java.util.Iterator; +import java.util.List; +import java.util.Optional; public class HDTManagerImpl extends HDTManager { @@ -187,4 +198,118 @@ protected HDT doHDTDiffBit(String location, String hdtFileName, Bitmap deleteBit 
 			return hdt;
 		}
 	}
-}
\ No newline at end of file
+
+	@Override
+	protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, String filename, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
+		try (InputStream is = IOUtil.getFileInputStream(filename)) {
+			return doHDTCatTree(fluxStop, supplier, is, baseURI, rdfNotation, hdtFormat, listener);
+		}
+	}
+
+	@Override
+	protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream stream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
+		RDFParserCallback parser = RDFParserFactory.getParserCallback(rdfNotation, useSimple(hdtFormat));
+		Iterator<TripleString> iterator = RDFParserFactory.readAsIterator(parser, stream, baseURI, true, rdfNotation);
+		return doHDTCatTree(fluxStop, supplier, iterator, baseURI, hdtFormat, listener);
+	}
+
+	@Override
+	protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
+		Path basePath;
+		String baseNameOpt = hdtFormat.get("loader.cattree.location");
+
+		if (baseNameOpt == null || baseNameOpt.isEmpty()) {
+			basePath = Files.createTempDirectory("hdt-java-cat-tree");
+		} else {
+			basePath = Path.of(baseNameOpt);
+		}
+
+		Path futureHDTLocation = Optional.ofNullable(hdtFormat.get("loader.cattree.futureHDTLocation")).map(Path::of).orElse(null);
+
+		FluxStopTripleStringIterator it = new FluxStopTripleStringIterator(iterator, fluxStop);
+
+		List<HDTFile> files = new ArrayList<>();
+
+		long gen = 0;
+		long cat = 0;
+
+		Path hdtStore = basePath.resolve("hdt-store");
+		Path hdtCatLocationPath = basePath.resolve("cat");
+		String hdtCatLocation = hdtCatLocationPath.toAbsolutePath().toString();
+
+		Files.createDirectories(hdtStore);
+		Files.createDirectories(hdtCatLocationPath);
+
+		boolean nextFile;
+		do {
+			// generate the hdt of the next chunk
+			gen++;
+			ProgressListener il = PrefixListener.of("gen#" + gen, listener);
+			Path hdtLocation = hdtStore.resolve("hdt-" + gen + ".hdt");
+			supplier.doGenerateHDT(it, baseURI, hdtFormat, il, hdtLocation);
+
+			nextFile = it.hasNextFlux();
+			HDTFile hdtFile = new HDTFile(hdtLocation, 1);
+
+			// merge the generated hdt with the previous files while they don't hold
+			// more chunks; this binary-counter-like merging keeps the number of
+			// temporary files logarithmic in the number of generated chunks
+			while (!files.isEmpty() && (!nextFile || (files.get(files.size() - 1)).getChunks() <= hdtFile.getChunks())) {
+				HDTFile lastHDTFile = files.remove(files.size() - 1);
+				cat++;
+				ProgressListener ilc = PrefixListener.of("cat#" + cat, listener);
+				Path hdtCatFileLocation = hdtStore.resolve("hdtcat-" + cat + ".hdt");
+				try (HDT abcat = HDTManager.catHDT(
+						hdtCatLocation,
+						lastHDTFile.getHdtFile().toAbsolutePath().toString(),
+						hdtFile.getHdtFile().toAbsolutePath().toString(),
+						hdtFormat, ilc)) {
+					abcat.saveToHDT(hdtCatFileLocation.toAbsolutePath().toString(), ilc);
+				}
+				// delete previous chunks
+				Files.delete(lastHDTFile.getHdtFile());
+				Files.delete(hdtFile.getHdtFile());
+				// note the new hdt file and the number of chunks
+				hdtFile = new HDTFile(hdtCatFileLocation, lastHDTFile.getChunks() + hdtFile.getChunks());
+			}
+			files.add(hdtFile);
+		} while (nextFile);
+
+		assert files.size() == 1;
+
+		Path hdtFile = files.get(0).hdtFile;
+
+		assert files.get(0).getChunks() == gen;
+		assert cat < gen;
+
+		// if a future HDT location has been asked, move to it and map the HDT
+		if (futureHDTLocation != null) {
+			Files.deleteIfExists(futureHDTLocation);
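+			// move the merged HDT to its final location and map it from there
+			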
Files.move(hdtFile, futureHDTLocation); + return HDTManager.mapHDT(futureHDTLocation.toAbsolutePath().toString()); + } + + // if no future location has been asked, load the HDT and delete it after + try { + return HDTManager.loadHDT(hdtFile.toAbsolutePath().toString()); + } finally { + Files.delete(hdtFile); + } + } + + private static class HDTFile { + private final Path hdtFile; + private final long chunks; + + public HDTFile(Path hdtFile, long chunks) { + this.hdtFile = hdtFile; + this.chunks = chunks; + } + + public long getChunks() { + return chunks; + } + + public Path getHdtFile() { + return hdtFile; + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java index f69f1bcb..f092cbbf 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java @@ -281,13 +281,9 @@ public void mapFromHDT(File f, long offset, ProgressListener listener) throws IO header.load(input, ci, iListener); // Set base URI. - try { - IteratorTripleString it = header.search("", HDTVocabulary.RDF_TYPE, HDTVocabulary.HDT_DATASET); - if(it.hasNext()) { - this.baseUri = it.next().getSubject().toString(); - } - } catch (NotFoundException e) { - log.error("Unexpected exception.", e); + this.baseUri = header.getBaseURI().toString(); + if (baseUri.isEmpty()) { + log.error("Empty base uri!"); } // Load dictionary @@ -611,7 +607,9 @@ public boolean isMapped() { * @param listener */ public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException { - System.out.println("Generating dictionary"); + if (listener != null) { + listener.notifyProgress(0, "Generating dictionary"); + } try (FourSectionDictionaryCat dictionaryCat = new FourSectionDictionaryCat(location)) { dictionaryCat.cat(hdt1.getDictionary(), hdt2.getDictionary(), listener); ControlInfo ci2 = new ControlInformation(); @@ -628,7 +626,9 @@ public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener) this.dictionary.close(); } this.dictionary = dictionary; - System.out.println("Generating triples"); + if (listener != null) { + listener.notifyProgress(0, "Generating triples"); + } BitmapTriplesIteratorCat it = new BitmapTriplesIteratorCat(hdt1.getTriples(), hdt2.getTriples(), dictionaryCat); BitmapTriplesCat bitmapTriplesCat = new BitmapTriplesCat(location); bitmapTriplesCat.cat(it, listener); @@ -664,13 +664,17 @@ public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener) Files.delete(Paths.get(location + "mapping_back_2")); Files.delete(Paths.get(location + "mapping_back_type_1")); Files.delete(Paths.get(location + "mapping_back_type_2")); - System.out.println("Generating header"); + if (listener != null) { + listener.notifyProgress(0, "Generating header"); + } this.header = HeaderFactory.createHeader(spec); - this.populateHeaderStructure("http://wdaqua.eu/hdtCat/"); + this.populateHeaderStructure(hdt1.getBaseURI()); } public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException { - System.out.println("Generating dictionary"); + if (listener != null) { + listener.notifyProgress(0, "Generating dictionary"); + } try (DictionaryCat dictionaryCat = new MultipleSectionDictionaryCat(location)) { dictionaryCat.cat(hdt1.getDictionary(), hdt2.getDictionary(), listener); //map the generated dictionary @@ -686,7 +690,9 @@ public void catCustom(String location, HDT hdt1, HDT hdt2, 
ProgressListener list this.dictionary = dictionary; } - System.out.println("Generating triples"); + if (listener != null) { + listener.notifyProgress(0, "Generating triples"); + } BitmapTriplesIteratorCat it = new BitmapTriplesIteratorCat(hdt1.getTriples(), hdt2.getTriples(), dictionaryCat); BitmapTriplesCat bitmapTriplesCat = new BitmapTriplesCat(location); bitmapTriplesCat.cat(it,listener); @@ -746,9 +752,11 @@ public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener list Files.delete(Paths.get(location+"mapping_back_2")); Files.delete(Paths.get(location+"mapping_back_type_1")); Files.delete(Paths.get(location+"mapping_back_type_2")); - System.out.println("Generating header"); + if (listener != null) { + listener.notifyProgress(0, "Generating header"); + } this.header = HeaderFactory.createHeader(spec); - this.populateHeaderStructure("http://wdaqua.eu/hdtCat/"); + this.populateHeaderStructure(hdt1.getBaseURI()); } public void diff(HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException { @@ -827,7 +835,7 @@ public void diffBit(String location, HDT hdt, Bitmap deleteBitmap, ProgressListe il.notifyProgress(90, "Set header..."); this.header = HeaderFactory.createHeader(spec); - this.populateHeaderStructure("http://wdaqua.eu/hdtDiff/"); + this.populateHeaderStructure(hdt.getBaseURI()); log.debug("Diff completed."); il.notifyProgress(100, "Diff completed..."); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java index 7ea92113..17b88c80 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java @@ -77,7 +77,7 @@ public PlainHeader(HDTOptions spec) { public void insert(CharSequence subject, CharSequence predicate, CharSequence object) { String objStr = object.toString(); if(objStr.charAt(0)=='<'|| objStr.charAt(0)=='"' || objStr.startsWith("http://")||objStr.startsWith("file://")) { - triples.add(new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), object)); + triples.add(new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), HeaderUtil.cleanURI(object))); } else { triples.add(new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), '"'+objStr+'"')); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FluxStopTripleStringIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FluxStopTripleStringIterator.java new file mode 100644 index 00000000..a1e5ee38 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FluxStopTripleStringIterator.java @@ -0,0 +1,65 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.rdf.RDFFluxStop; +import org.rdfhdt.hdt.triples.TripleString; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +public class FluxStopTripleStringIterator implements Iterator { + private TripleString next; + private final Iterator iterator; + private final RDFFluxStop fluxStop; + private boolean stop; + + public FluxStopTripleStringIterator(Iterator iterator, RDFFluxStop fluxStop) { + this.iterator = iterator; + this.fluxStop = fluxStop; + } + + @Override + public boolean hasNext() { + if (stop) { + return false; + } + if (next != null) { + return true; + } + + if (!iterator.hasNext()) { + return false; + } + + next = iterator.next(); + + if (!fluxStop.canHandle(next)) { + stop = true; + return false; + 
+		}
+
+		return true;
+	}
+
+	/**
+	 * @return if a new flux can be extracted, will restart the flux
+	 */
+	public boolean hasNextFlux() {
+		stop = false;
+		fluxStop.restart();
+		return hasNext();
+	}
+
+	@Override
+	public TripleString next() {
+		if (!hasNext()) {
+			throw new NoSuchElementException();
+		}
+		try {
+			return next;
+		} finally {
+			next = null;
+		}
+	}
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java
new file mode 100644
index 00000000..ca933bc9
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java
@@ -0,0 +1,44 @@
+package org.rdfhdt.hdt.iterator.utils;
+
+import java.util.Iterator;
+import java.util.function.Function;
+
+/**
+ * Iterator to map a value to another
+ * @param <T> origin type
+ * @param <N> return type
+ * @author Antoine Willerval
+ */
+public class MapIterator<T, N> implements Iterator<N> {
+	private final MapWithIdFunction<T, N> mappingFunction;
+	private final Iterator<T> base;
+	private long index;
+
+	public MapIterator(Iterator<T> base, Function<T, N> mappingFunction) {
+		this(base, (m, i) -> mappingFunction.apply(m));
+	}
+	public MapIterator(Iterator<T> base, MapWithIdFunction<T, N> mappingFunction) {
+		this.base = base;
+		this.mappingFunction = mappingFunction;
+	}
+
+	@Override
+	public boolean hasNext() {
+		return base.hasNext();
+	}
+
+	@Override
+	public N next() {
+		return mappingFunction.apply(base.next(), index++);
+	}
+
+	@Override
+	public void remove() {
+		base.remove();
+	}
+
+	@FunctionalInterface
+	public interface MapWithIdFunction<T, E> {
+		E apply(T element, long index);
+	}
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java
new file mode 100644
index 00000000..2c279575
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java
@@ -0,0 +1,198 @@
+package org.rdfhdt.hdt.iterator.utils;
+
+import org.rdfhdt.hdt.compact.integer.VByte;
+import org.rdfhdt.hdt.util.io.IOUtil;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PipedInputStream;
+import java.io.PipedOutputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.function.Function;
+
+/**
+ * a utility class to create an iterator from values returned by another Thread
+ *
+ * @param <T> the iterator type
+ * @author Antoine Willerval
+ */
+public class PipedCopyIterator<T> implements Iterator<T> {
+	/**
+	 * RuntimeException generated by the PipedCopyIterator
+	 *
+	 * @author Antoine Willerval
+	 */
+	public static class PipedIteratorException extends RuntimeException {
+		public PipedIteratorException(String message, Throwable t) {
+			super(message, t);
+		}
+	}
+
+
+	/**
+	 * Callback for the {@link #createOfCallback(PipedCopyIterator.Parser, PipedCopyIterator.PipeCallBack)} method
+	 *
+	 * @param <T> the iterator type
+	 * @author Antoine Willerval
+	 */
+	@FunctionalInterface
+	public interface PipeCallBack<T> {
+		/**
+		 * method called from the new thread to generate the new data, at the end of the callback, the pipe is closed
+		 * with or without exception
+		 *
+		 * @param pipe the pipe to fill
+		 * @throws Exception any exception returned by the generator
+		 */
+		void createPipe(PipedCopyIterator<T> pipe) throws Exception;
+	}
+
+	/**
+	 * create a piped iterator from a callback runner; the callback is run in a new thread
+	 *
+	 * @param serializer serializer to pass the data between the threads
+	 * @param callbackRunner the callback runner
+	 * @param <T> type of the iterator
+	 * @return the iterator
+	 */
+	public static <T> PipedCopyIterator<T> createOfCallback(Parser<T> serializer, PipeCallBack<T> callbackRunner) {
+		PipedCopyIterator<T> pipe = new PipedCopyIterator<>(serializer);
+
+		Thread thread = new Thread(() -> {
+			try {
+				callbackRunner.createPipe(pipe);
+				pipe.closePipe();
+			} catch (Throwable e) {
+				pipe.closePipe(e);
+			}
+		}, "PipeIterator");
+		thread.start();
+
+		return pipe;
+	}
+	public interface Parser<T> {
+		static void writeString(CharSequence s, OutputStream out) throws IOException {
+			byte[] bytes = s.toString().getBytes(StandardCharsets.UTF_8);
+			VByte.encode(out, bytes.length);
+			out.write(bytes);
+		}
+		static String readString(InputStream in) throws IOException {
+			int size = (int) VByte.decode(in);
+			byte[] bytes = IOUtil.readBuffer(in, size, null);
+			return new String(bytes, StandardCharsets.UTF_8);
+		}
+		void write(T t, OutputStream stream) throws IOException;
+		T read(InputStream stream) throws IOException;
+	}
+
+	private final PipedInputStream in;
+	private final PipedOutputStream out;
+	private final Parser<T> serializer;
+	private T next;
+	private boolean end;
+	private PipedIteratorException exception;
+
+	public PipedCopyIterator(Parser<T> serializer) {
+		this.serializer = serializer;
+		try {
+			in = new PipedInputStream();
+			out = new PipedOutputStream();
+			in.connect(out);
+		} catch (IOException e) {
+			throw new PipedIteratorException("can't connect pipe", e);
+		}
+	}
+	private int readByte() {
+		try {
+			return in.read();
+		} catch (IOException e) {
+			throw new PipedIteratorException("Can't read byte", e);
+		}
+	}
+
+	@Override
+	public boolean hasNext() {
+		if (end) {
+			return false;
+		}
+		if (next != null) {
+			return true;
+		}
+
+		int b = readByte();
+		if (b == 0) {
+			end = true;
+			if (exception != null) {
+				throw exception;
+			}
+			return false;
+		}
+		try {
+			next = serializer.read(in);
+		} catch (IOException e) {
+			throw new PipedIteratorException("Can't read pipe", e);
+		}
+		return true;
+	}
+
+	@Override
+	public T next() {
+		if (!hasNext()) {
+			// respect the Iterator contract instead of returning null
+			throw new NoSuchElementException();
+		}
+		T next = this.next;
+		this.next = null;
+		return next;
+	}
+
+	public void closePipe() {
+		closePipe(null);
+	}
+	public void closePipe(Throwable e) {
+		if (e != null) {
+			if (e instanceof PipedIteratorException) {
+				this.exception = (PipedIteratorException) e;
+			} else {
+				this.exception = new PipedIteratorException("closing exception", e);
+			}
+		}
+		try {
+			// end byte
+			out.write(0);
+		} catch (IOException ee) {
+			throw new PipedIteratorException("Can't close pipe", ee);
+		}
+	}
+
+	/**
+	 * map this iterator to another type
+	 * @param mappingFunction the mapping function
+	 * @param <E> the future type
+	 * @return mapped iterator
+	 */
+	public <E> Iterator<E> map(Function<T, E> mappingFunction) {
+		return new MapIterator<>(this, mappingFunction);
+	}
+	/**
+	 * map this iterator to another type
+	 * @param mappingFunction the mapping function
+	 * @param <E> the future type
+	 * @return mapped iterator
+	 */
+	public <E> Iterator<E> mapWithId(MapIterator.MapWithIdFunction<T, E> mappingFunction) {
+		return new MapIterator<>(this, mappingFunction);
+	}
+
+	public void addElement(T node) {
+		try {
+			// not end byte
+			out.write(1);
+			serializer.write(node, out);
+		} catch (IOException ee) {
+			throw new PipedIteratorException("Can't add element to pipe", ee);
+		}
+	}
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java
index 246396c9..40bde70d 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java
@@ -29,6 +29,7 @@
 import org.rdfhdt.hdt.enums.RDFNotation;
 import org.rdfhdt.hdt.exceptions.NotImplementedException;
+import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator;
 import org.rdfhdt.hdt.rdf.parsers.RDFParserDir;
 import org.rdfhdt.hdt.rdf.parsers.RDFParserHDT;
 import org.rdfhdt.hdt.rdf.parsers.RDFParserList;
@@ -37,6 +38,12 @@
 import org.rdfhdt.hdt.rdf.parsers.RDFParserSimple;
 import org.rdfhdt.hdt.rdf.parsers.RDFParserTar;
 import org.rdfhdt.hdt.rdf.parsers.RDFParserZip;
+import org.rdfhdt.hdt.triples.TripleString;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Iterator;
 
 /**
  * @author mario.arias
@@ -78,4 +85,34 @@ public static RDFParserCallback getParserCallback(RDFNotation notation, boolean
 		throw new NotImplementedException("Parser not found for notation: "+notation);
 	}
+
+	/**
+	 * convert a stream to a triple iterator
+	 * @param parser the parser to convert the stream
+	 * @param stream the stream to parse
+	 * @param baseUri the base uri to parse
+	 * @param keepBNode keep the blank nodes
+	 * @param notation the rdf notation to parse
+	 * @return iterator
+	 */
+	public static Iterator<TripleString> readAsIterator(RDFParserCallback parser, InputStream stream, String baseUri, boolean keepBNode, RDFNotation notation) {
+		return PipedCopyIterator.createOfCallback(TripleStringParser.INSTANCE, pipe -> parser.doParse(stream, baseUri, notation, keepBNode, (triple, pos) -> pipe.addElement(triple)));
+	}
+
+	private static class TripleStringParser implements PipedCopyIterator.Parser<TripleString> {
+		private static final TripleStringParser INSTANCE = new TripleStringParser();
+		@Override
+		public void write(TripleString tripleString, OutputStream stream) throws IOException {
+			PipedCopyIterator.Parser.writeString(tripleString.getSubject(), stream);
+			PipedCopyIterator.Parser.writeString(tripleString.getPredicate(), stream);
+			PipedCopyIterator.Parser.writeString(tripleString.getObject(), stream);
+		}
+
+		@Override
+		public TripleString read(InputStream stream) throws IOException {
+			String s = PipedCopyIterator.Parser.readString(stream);
+			String p = PipedCopyIterator.Parser.readString(stream);
+			String o = PipedCopyIterator.Parser.readString(stream);
+			return new TripleString(s, p, o);
+		}
+	}
 }
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorCat.java
index a6a071d9..810aab43 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorCat.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorCat.java
@@ -130,9 +130,6 @@ public TripleID next() {
 			list = getTripleID(count).listIterator();
 			count ++;
-			if (count%100000==0){
-				System.out.println(count);
-			}
 			return list.next();
 		}
 	}
@@ -199,4 +196,4 @@ private long mapIdPredicate(long id, CatMapping catMapping){
 	}
 
 
-}
\ No newline at end of file
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java
new file mode 100644
index 00000000..c215e2d2
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java
@@ -0,0 +1,198 @@
+package 
org.rdfhdt.hdt.util.concurrent;
+
+import java.util.Objects;
+
+/**
+ * Thread that can complete with an exception; the exception can be read when joining the thread with
+ * {@link #joinAndCrashIfRequired()} or by using {@link #getException()}. A thread can be attached to
+ * other threads with {@link #attach(ExceptionThread...)} to interrupt all of them if an exception
+ * occurs in one of them.
+ *
+ * @author Antoine Willerval
+ */
+public class ExceptionThread extends Thread {
+	/**
+	 * create exception threads of multiple runnables
+	 *
+	 * @param name common name
+	 * @param runnables the runnables list, can't be empty
+	 * @return exception thread attached with the other runnables
+	 * @throws java.lang.IllegalArgumentException if the array is empty
+	 * @throws java.lang.NullPointerException if an argument is null
+	 */
+	public static ExceptionThread async(String name, ExceptionRunnable... runnables) {
+		Objects.requireNonNull(name, "name can't be null!");
+		Objects.requireNonNull(runnables, "runnables can't be null");
+		for (int i = 0; i < runnables.length; i++) {
+			Objects.requireNonNull(runnables[i], "runnable#" + i + " is null!");
+		}
+		if (runnables.length == 0) {
+			throw new IllegalArgumentException("empty runnable list");
+		}
+
+		ExceptionThread thread = new ExceptionThread(runnables[0], name + "#" + 0);
+
+		for (int i = 1; i < runnables.length; i++) {
+			thread.attach(new ExceptionThread(runnables[i], name + "#" + i));
+		}
+
+		return thread;
+	}
+
+
+	/**
+	 * Version of {@link java.lang.Runnable} with an exception
+	 */
+	@FunctionalInterface
+	public interface ExceptionRunnable {
+		/**
+		 * Runnable used in an {@link org.rdfhdt.hdt.util.concurrent.ExceptionThread}, can throw an exception
+		 *
+		 * @see org.rdfhdt.hdt.util.concurrent.ExceptionThread#ExceptionThread(org.rdfhdt.hdt.util.concurrent.ExceptionThread.ExceptionRunnable, String)
+		 * @throws java.lang.Exception if any
+		 */
+		void run() throws Exception;
+	}
+
+	private Throwable exception = null;
+	private final ExceptionRunnable target;
+	private ExceptionThread next;
+	private ExceptionThread prev;
+
+	public ExceptionThread(ExceptionRunnable target, String name) {
+		super(name);
+		this.target = target;
+	}
+
+	/**
+	 * attach other threads to wait with this one
+	 *
+	 * @param threads others
+	 * @return this
+	 */
+	public ExceptionThread attach(ExceptionThread... 
threads) { + Objects.requireNonNull(threads, "can't attach null thread"); + for (ExceptionThread thread : threads) { + if (thread.prev != null) { + throw new IllegalArgumentException("Thread " + thread.getName() + " already attached"); + } + if (this.next != null) { + this.next.attach(thread); + continue; + } + this.next = thread; + thread.prev = this; + } + return this; + } + + /** + * start this thread and all attached thread + * + * @return this + */ + public ExceptionThread startAll() { + ExceptionThread prev = this.prev; + while (prev != null) { + prev.start(); + prev = prev.prev; + } + start(); + ExceptionThread next = this.next; + while (next != null) { + next.start(); + next = next.next; + } + return this; + } + + @Override + public final void run() { + try { + target.run(); + } catch (Throwable t) { + if (exception != null) { + exception.addSuppressed(t); + return; // another attached thread crashed, probably interruption exception + } + exception = t; + if (this.next != null) { + this.next.interruptForward(t); + } + if (this.prev != null) { + this.prev.interruptBackward(t); + } + } + } + + private void interruptBackward(Throwable t) { + exception = t; + if (this.prev != null) { + this.prev.interruptBackward(t); + } + interrupt(); + } + + private void interruptForward(Throwable t) { + exception = t; + if (this.next != null) { + this.next.interruptForward(t); + } + interrupt(); + } + + /** + * @return the exception returned by this thread, another attached thread or null if no exception occurred + */ + public Throwable getException() { + return exception; + } + + /** + * join this thread and create an exception if required, will convert it to a runtime exception if it can't be + * created. If the thread returned an exception while the current thread is interrupted, the exception will be + * suppressed in the {@link java.lang.InterruptedException}. 
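+	 * <p>A typical use (sketch, with hypothetical produce/consume tasks):</p>
+	 * <pre>{@code
+	 * ExceptionThread.async("Workers",
+	 *         () -> produce(),
+	 *         () -> consume()
+	 * ).startAll().joinAndCrashIfRequired();
+	 * }</pre>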
+ * + * @throws InterruptedException interruption while joining the thread + * @throws ExceptionThreadException if the thread or any attached thread returned an exception + */ + public void joinAndCrashIfRequired() throws InterruptedException { + try { + join(); + ExceptionThread next = this.next; + while (next != null) { + next.join(); + next = next.next; + } + ExceptionThread prev = this.prev; + while (prev != null) { + prev.join(); + prev = prev.prev; + } + } catch (InterruptedException ie) { + // we got an exception in the thread while this thread was interrupted + if (exception != null) { + ie.addSuppressed(exception); + } + throw ie; + } + if (exception == null) { + return; + } + if (exception instanceof ExceptionThreadException) { + throw (ExceptionThreadException) exception; + } + throw new ExceptionThreadException(exception); + } + + /** + * Exception returned by {@link #joinAndCrashIfRequired()}, will always have a cause + * + * @author Antoine Willerval + */ + public static class ExceptionThreadException extends RuntimeException { + public ExceptionThreadException(Throwable cause) { + super(cause); + } + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixListener.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixListener.java new file mode 100644 index 00000000..af452c38 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixListener.java @@ -0,0 +1,42 @@ +package org.rdfhdt.hdt.util.listener; + +import org.rdfhdt.hdt.listener.ProgressListener; + +/** + * Simple {@link org.rdfhdt.hdt.listener.ProgressListener} implementation concatenating a prefix to each message + * + * @author Antoine Willerval + */ +public class PrefixListener implements ProgressListener { + /** + * create a prefix listener from another listener + * + * @param prefix prefix to concat to the messages + * @param listener the listener + * @return null if listener is null, listener if prefix is null or empty or a prefix listener + */ + public static ProgressListener of(String prefix, ProgressListener listener) { + if (listener == null) { + return null; + } + + if (prefix == null || prefix.isEmpty()) { + return listener; + } + + return new PrefixListener(prefix, listener); + } + + private final String prefix; + private final ProgressListener listener; + + private PrefixListener(String prefix, ProgressListener listener) { + this.prefix = prefix; + this.listener = listener; + } + + @Override + public void notifyProgress(float level, String message) { + listener.notifyProgress(level, prefix + message); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java new file mode 100644 index 00000000..bc7455bd --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java @@ -0,0 +1,208 @@ +package org.rdfhdt.hdt.hdtCat; + +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Suite; +import org.rdfhdt.hdt.dictionary.Dictionary; +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.exceptions.NotFoundException; +import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.hdt.HDT; +import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.hdt.HDTSupplier; 
+import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.rdf.RDFFluxStop; +import org.rdfhdt.hdt.triples.IteratorTripleString; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils; +import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier; +import org.rdfhdt.hdt.util.StopWatch; +import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Collection; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +@RunWith(Suite.class) +@Suite.SuiteClasses({ + HDTCatTreeTest.DynamicTest.class +}) +public class HDTCatTreeTest { + private static class HDTManagerTestBase extends AbstractMapMemoryTest implements ProgressListener { + protected static final long SIZE = 1L << 15; + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + protected Path workDir; + protected HDTSpecification spec; + + @Before + public void setupManager() throws IOException { + spec = new HDTSpecification(); + workDir = tempDir.newFolder().toPath(); + spec.set("loader.cattree.location", workDir.toAbsolutePath().toString()); + } + + @Override + public void notifyProgress(float level, String message) { + // System.out.println("[" + level + "] " + message); + } + + protected void assertEqualsHDT(HDT expected, HDT actual, int ignoredHeader) throws NotFoundException { + + // test dictionary + Dictionary ed = expected.getDictionary(); + Dictionary ad = actual.getDictionary(); + assertEqualsHDT("Subjects", ed.getSubjects(), ad.getSubjects()); + assertEqualsHDT("Predicates", ed.getPredicates(), ad.getPredicates()); + assertEqualsHDT("Objects", ed.getObjects(), ad.getObjects()); + assertEqualsHDT("Shared", ed.getShared(), ad.getShared()); + assertEquals(ed.getType(), ad.getType()); + + // test triples + IteratorTripleString actualIt = actual.search("", "", ""); + IteratorTripleString expectedIt = expected.search("", "", ""); + + while (expectedIt.hasNext()) { + assertTrue(actualIt.hasNext()); + + TripleString expectedTriple = expectedIt.next(); + TripleString actualTriple = actualIt.next(); + assertEquals(expectedIt.getLastTriplePosition(), actualIt.getLastTriplePosition()); + assertEquals(expectedTriple, actualTriple); + } + assertFalse(actualIt.hasNext()); + + // test header + assertEquals(expected.getHeader().getBaseURI(), actual.getHeader().getBaseURI()); + assertEquals(expected.getHeader().getNumberOfElements() + ignoredHeader, actual.getHeader().getNumberOfElements()); + } + + protected void assertEqualsHDT(String section, DictionarySection excepted, DictionarySection actual) { + Iterator itEx = excepted.getSortedEntries(); + Iterator itAc = actual.getSortedEntries(); + Comparator csc = CharSequenceComparator.getInstance(); + + while (itEx.hasNext()) { + assertTrue(itAc.hasNext()); + CharSequence expectedTriple = itEx.next(); + CharSequence actualTriple = itAc.next(); + assertEquals(section + " section strings", 0, csc.compare(expectedTriple, actualTriple)); + } + assertFalse(itAc.hasNext()); + assertEquals(excepted.getNumberOfElements(), actual.getNumberOfElements()); + } + } + + 
@RunWith(Parameterized.class) + public static class DynamicTest extends HDTManagerTestBase { + + @Parameterized.Parameters(name = "{0}") + public static Collection params() { + return List.of( + new Object[]{"base", SIZE * 16, 20, 50, false}, + new Object[]{"duplicates", SIZE * 16, 10, 50, false}, + new Object[]{"large-literals", SIZE * 4, 20, 250, false}, + new Object[]{"quiet", SIZE * 16, 10, 50, false} + ); + } + + @Parameterized.Parameter + public String name; + @Parameterized.Parameter(1) + public long maxSize; + @Parameterized.Parameter(2) + public int maxElementSplit; + @Parameterized.Parameter(3) + public int maxLiteralSize; + @Parameterized.Parameter(4) + public boolean quiet; + + @Test + public void catTreeTest() throws IOException, ParserException, NotFoundException, InterruptedException { + LargeFakeDataSetStreamSupplier supplier = + LargeFakeDataSetStreamSupplier + .createSupplierWithMaxSize(maxSize, 34) + .withMaxElementSplit(maxElementSplit) + .withMaxLiteralSize(maxLiteralSize); + + // create DISK HDT + LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(); + HDT actual = null; + try { + actual = HDTManager.catTree( + RDFFluxStop.sizeLimit(SIZE), + HDTSupplier.memory(), + genActual.getStream(), + HDTTestUtils.BASE_URI, + RDFNotation.NTRIPLES, + spec, + quiet ? null : this + ); + } finally { + if (actual == null) { + genActual.getThread().interrupt(); + } + } + genActual.getThread().joinAndCrashIfRequired(); + + supplier.reset(); + + Iterator genExpected = supplier.createTripleStringStream(); + // create MEMORY HDT + HDT expected = HDTManager.generateHDT( + genExpected, + HDTTestUtils.BASE_URI, + spec, + null + ); + + // happy compiler, should throw before + assertNotNull(expected); + assertNotNull(actual); + try { + assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat + } finally { + IOUtil.closeAll(expected, actual); + } + } + + } + + @Ignore("handTests") + public static class HandTest extends HDTManagerTestBase { + @Test + public void bigTest() throws ParserException, IOException { + LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier + .createSupplierWithMaxSize(10_000_000_000L, 94); + + HDTOptions spec = new HDTSpecification(); + StopWatch watch = new StopWatch(); + watch.reset(); + try (HDT hdt = HDTManager.catTree(RDFFluxStop.sizeLimit(1_000_000_000), HDTSupplier.memory(), + supplier.createTripleStringStream(), HDTTestUtils.BASE_URI, spec, + (level, message) -> System.out.println("[" + level + "] " + message) + )) { + System.out.println(watch.stopAndShow()); + System.out.println(hdt.getTriples().getNumberOfElements()); + } + } + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java index 738c9e96..ca372111 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java @@ -6,19 +6,24 @@ import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.string.ByteStringUtil; import java.io.FileWriter; import java.io.IOException; +import java.io.InputStream; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.io.PrintStream; import java.nio.charset.Charset; 
import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Iterator; import java.util.Random; public class LargeFakeDataSetStreamSupplier { - private static final Charset DEFAULT_CHARSET = Charset.defaultCharset(); + private static final Charset DEFAULT_CHARSET = ByteStringUtil.STRING_ENCODING; /** * create a lowercase name from a number, to create string without any number in it @@ -64,6 +69,7 @@ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxTriples(long m private final long maxSize; private final long maxTriples; public int maxFakeType = 10; + public int maxLiteralSize = 2; public int maxElementSplit = Integer.MAX_VALUE; private LargeFakeDataSetStreamSupplier(long maxSize, long maxTriples, long seed) { @@ -89,8 +95,26 @@ public void createNTFile(String file) throws IOException { } } + public ThreadedStream createNTInputStream() throws IOException { + PipedOutputStream pout = new PipedOutputStream(); + InputStream is = new PipedInputStream(pout); + + ExceptionThread run = new ExceptionThread(() -> { + try (PrintStream ps = new PrintStream(pout, true)) { + Iterator it = createTripleStringStream(); + while (it.hasNext()) { + it.next().dumpNtriple(ps); + } + } + }, + "ThreadedFakedStream"); + run.start(); + + return new ThreadedStream(run, is); + } + public HDT createFakeHDTTwoPass(HDTOptions spec) throws ParserException, IOException { - Path f = Paths.get("tempNtFile.nt").toAbsolutePath(); + Path f = Path.of("tempNtFile.nt").toAbsolutePath(); try { createNTFile(f.toString()); spec.set("loader.type", "two-pass"); @@ -130,14 +154,22 @@ private CharSequence createValue() { if (random.nextBoolean()) { return createPredicate(); } - - String text = "\"" + stringNameOfInt(random.nextInt(maxElementSplit)) + "\""; - if (random.nextBoolean()) { + int size = random.nextInt(maxLiteralSize); + StringBuilder litText = new StringBuilder(); + for (int i = 0; i < size; i++) { + litText.append(stringNameOfInt(random.nextInt(maxElementSplit))).append(" "); + } + String text = "\"" + litText + "\""; + int litType = random.nextInt(3); + if (litType == 1) { // language node return text + "@" + stringNameOfInt(random.nextInt(maxElementSplit)); - } else { + } else if (litType == 2) { // typed node return text + "^^<" + createType() + ">"; + } else { + // no type/language node + return text; } } @@ -185,4 +217,37 @@ public TripleString next() { } } -} \ No newline at end of file + public LargeFakeDataSetStreamSupplier withMaxFakeType(int maxFakeType) { + this.maxFakeType = maxFakeType; + return this; + } + + public LargeFakeDataSetStreamSupplier withMaxElementSplit(int maxElementSplit) { + this.maxElementSplit = maxElementSplit; + return this; + } + + public LargeFakeDataSetStreamSupplier withMaxLiteralSize(int maxLiteralSize) { + this.maxLiteralSize = maxLiteralSize; + return this; + + } + + public static class ThreadedStream { + private final ExceptionThread thread; + private final InputStream stream; + + public ThreadedStream(ExceptionThread thread, InputStream stream) { + this.thread = thread; + this.stream = stream; + } + + public ExceptionThread getThread() { + return thread; + } + + public InputStream getStream() { + return stream; + } + } +} From 2b32171b89f3bd4be7ee1e9940d19f3a235c8987 Mon Sep 17 00:00:00 2001 From: qaate47 Date: Thu, 15 Sep 2022 16:46:39 +0200 Subject: [PATCH 2/9] add HDTGenerateDisk method with tests --- .../org/rdfhdt/hdt/enums/CompressionType.java | 52 ++ .../java/org/rdfhdt/hdt/hdt/HDTManager.java | 152 ++++ 
.../java/org/rdfhdt/hdt/hdt/HDTSupplier.java | 13 +- .../hdt/listener/MultiThreadListener.java | 48 ++ .../rdfhdt/hdt/options/HDTOptionsKeys.java | 103 +++ .../java/org/rdfhdt/hdt/rdf/RDFFluxStop.java | 44 ++ .../org/rdfhdt/hdt/rdf/RDFParserCallback.java | 1 + .../org/rdfhdt/hdt/triples/TripleString.java | 12 + .../org/rdfhdt/hdt/util/UnicodeEscape.java | 4 + .../java/org/rdfhdt/hdt/tools/HDTVerify.java | 25 +- .../java/org/rdfhdt/hdt/tools/RDF2HDT.java | 120 ++- .../listener/MultiThreadListenerConsole.java | 104 +++ .../compact/bitmap/AppendableWriteBitmap.java | 175 +++++ .../hdt/compact/sequence/DynamicSequence.java | 19 +- .../sequence/SequenceLog64BigDisk.java | 8 +- .../hdt/dictionary/DictionaryPrivate.java | 6 +- .../hdt/dictionary/impl/BaseDictionary.java | 9 +- .../impl/CompressFourSectionDictionary.java | 232 ++++++ .../impl/FourSectionDictionary.java | 27 +- .../impl/FourSectionDictionaryBig.java | 14 + .../impl/MultipleBaseDictionary.java | 7 + .../impl/MultipleSectionDictionary.java | 7 + .../impl/WriteFourSectionDictionary.java | 106 +++ .../section/OneReadDictionarySection.java | 76 ++ .../impl/section/WriteDictionarySection.java | 155 ++++ .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java | 299 +++++++- .../java/org/rdfhdt/hdt/hdt/impl/HDTBase.java | 166 +++++ .../java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java | 161 +--- .../org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java | 111 +++ .../impl/diskimport/CompressTripleMapper.java | 134 ++++ .../impl/diskimport/CompressionResult.java | 68 ++ .../diskimport/CompressionResultEmpty.java | 61 ++ .../diskimport/CompressionResultFile.java | 89 +++ .../diskimport/CompressionResultPartial.java | 142 ++++ .../impl/diskimport/SectionCompressor.java | 459 ++++++++++++ .../diskimport/TripleCompressionResult.java | 20 + .../TripleCompressionResultEmpty.java | 42 ++ .../TripleCompressionResultFile.java | 44 ++ .../TripleCompressionResultPartial.java | 64 ++ .../org/rdfhdt/hdt/header/PlainHeader.java | 22 +- .../iterator/utils/AsyncIteratorFetcher.java | 46 ++ .../hdt/iterator/utils/ExceptionIterator.java | 168 +++++ .../hdt/iterator/utils/FileChunkIterator.java | 105 +++ .../iterator/utils/FileTripleIDIterator.java | 17 + .../iterator/utils/FileTripleIterator.java | 37 + .../iterator/utils/MapExceptionIterator.java | 47 ++ .../utils/MergeExceptionIterator.java | 136 ++++ .../utils/NotificationExceptionIterator.java | 57 ++ .../hdt/iterator/utils/PipedCopyIterator.java | 112 ++- .../hdt/iterator/utils/SizeFetcher.java | 59 ++ .../org/rdfhdt/hdt/rdf/RDFParserFactory.java | 21 +- .../org/rdfhdt/hdt/triples/IndexedNode.java | 39 + .../org/rdfhdt/hdt/triples/IndexedTriple.java | 43 ++ .../org/rdfhdt/hdt/triples/TempTriples.java | 5 - .../rdfhdt/hdt/triples/TriplesPrivate.java | 6 + .../hdt/triples/impl/BitmapTriples.java | 16 +- .../hdt/triples/impl/OneReadTempTriples.java | 243 ++++++ .../rdfhdt/hdt/triples/impl/TriplesList.java | 10 +- .../hdt/triples/impl/TriplesListLong.java | 10 +- .../hdt/triples/impl/WriteBitmapTriples.java | 252 +++++++ .../java/org/rdfhdt/hdt/util/BitUtil.java | 2 +- .../hdt/util/ParallelSortableArrayList.java | 189 +++++ .../java/org/rdfhdt/hdt/util/Profiler.java | 155 ++++ .../java/org/rdfhdt/hdt/util/Reference.java | 60 ++ .../util/concurrent/ExceptionFunction.java | 6 + .../util/concurrent/ExceptionSupplier.java | 6 + .../hdt/util/concurrent/HeightTree.java | 97 +++ .../hdt/util/concurrent/KWayMerger.java | 302 ++++++++ .../hdt/util/concurrent/SyncListener.java | 28 + .../hdt/util/concurrent/TreeWorker.java | 699 ++++++++++++++++++ 
.../org/rdfhdt/hdt/util/disk/LongArray.java | 24 + .../rdfhdt/hdt/util/disk/LongArrayDisk.java | 8 +- .../hdt/util/io/CloseMappedByteBuffer.java | 26 +- .../rdfhdt/hdt/util/io/CloseSuppressPath.java | 247 +++++++ .../java/org/rdfhdt/hdt/util/io/IOUtil.java | 236 ++++-- .../compress/CompressNodeMergeIterator.java | 26 + .../util/io/compress/CompressNodeReader.java | 90 +++ .../util/io/compress/CompressNodeWriter.java | 62 ++ .../compress/CompressTripleMergeIterator.java | 27 + .../io/compress/CompressTripleReader.java | 87 +++ .../io/compress/CompressTripleWriter.java | 61 ++ .../hdt/util/io/compress/CompressUtil.java | 202 +++++ .../io/compress/MapCompressTripleMerger.java | 244 ++++++ .../compress/NoDuplicateTripleIDIterator.java | 95 +++ .../hdt/util/io/compress/TripleGenerator.java | 28 + .../io/compress/WriteLongArrayBuffer.java | 224 ++++++ .../util/listener/IntermediateListener.java | 53 +- .../hdt/util/listener/ListenerUtil.java | 23 +- .../listener/PrefixMultiThreadListener.java | 24 + .../hdt/util/string/ByteStringUtil.java | 2 + .../hdt/util/string/ReplazableString.java | 22 +- .../bitmap/AppendableWriteBitmapTest.java | 51 ++ .../hdt/compact/sequence/SequenceTest.java | 132 ++++ .../CompressFourSectionDictionaryTest.java | 177 +++++ .../section/OneReadDictionarySectionTest.java | 69 ++ .../org/rdfhdt/hdt/hdt/HDTManagerTest.java | 654 ++++++++++++++++ .../org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java | 208 ------ .../utils/MergeExceptionIteratorTest.java | 28 + .../util/LargeFakeDataSetStreamSupplier.java | 32 +- .../LargeFakeDataSetStreamSupplierTest.java | 48 ++ .../hdt/util/concurrent/KWayMergerTest.java | 169 +++++ .../hdt/util/concurrent/TreeWorkerTest.java | 308 ++++++++ .../org/rdfhdt/hdt/util/io/IOUtilTest.java | 132 +++- .../util/io/compress/CompressNodeTest.java | 257 +++++++ .../hdt/util/io/compress/CompressTest.java | 75 ++ .../util/io/compress/CompressTripleTest.java | 206 ++++++ .../hdtGenDisk/unicode_disk_encode.nt | 2 + 107 files changed, 10077 insertions(+), 656 deletions(-) create mode 100644 hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java create mode 100644 hdt-api/src/main/java/org/rdfhdt/hdt/listener/MultiThreadListener.java create mode 100644 hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java create mode 100644 hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultEmpty.java create mode 100644 
hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultFile.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultPartial.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResult.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultEmpty.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultFile.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultPartial.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/AsyncIteratorFetcher.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileChunkIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIDIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapExceptionIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SizeFetcher.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedTriple.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/OneReadTempTriples.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/WriteBitmapTriples.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/ParallelSortableArrayList.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Reference.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionFunction.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionSupplier.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/HeightTree.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/KWayMerger.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncListener.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/TreeWorker.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArray.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeMergeIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleMergeIterator.java create mode 100644 
hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleReader.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleWriter.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/MapCompressTripleMerger.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/NoDuplicateTripleIDIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/TripleGenerator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/WriteLongArrayBuffer.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixMultiThreadListener.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmapTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/compact/sequence/SequenceTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java delete mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIteratorTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/KWayMergerTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/TreeWorkerTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTripleTest.java create mode 100644 hdt-java-core/src/test/resources/hdtGenDisk/unicode_disk_encode.nt diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java b/hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java new file mode 100644 index 00000000..0b3dc6e9 --- /dev/null +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java @@ -0,0 +1,52 @@ +package org.rdfhdt.hdt.enums; + +/** + * A compression type + * @author Antoine Willerval + */ +public enum CompressionType { + + /** + * gzip compression (.gz .tgz) + */ + GZIP("gz", "tgz"), + /** + * bzip compression (.bz2 .bz) + */ + BZIP("bz2", "bz"), + /** + * bzip compression (.xz) + */ + XZ("xz"), + /** + * no compression + */ + NONE; + + /** + * try to guess a compression of a file with its name + * @param fileName the file name to guess + * @return the compression type or none if it can't be guessed + */ + public static CompressionType guess(String fileName) { + String str = fileName.toLowerCase(); + + int idx = str.lastIndexOf('.'); + if(idx!=-1) { + String ext = str.substring(idx + 1); + for (CompressionType type: values()) { + for (String typeExt : type.ext) { + if (typeExt.equals(ext)) { + return type; + } + } + } + } + return NONE; + } + + private final String[] ext; + CompressionType(String... 
ext) { + this.ext = ext; + } +} diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java index da45017b..78c1127d 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java @@ -6,6 +6,7 @@ import java.util.Iterator; import org.rdfhdt.hdt.compact.bitmap.Bitmap; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.listener.ProgressListener; @@ -291,6 +292,153 @@ public static HDT generateHDT(String rdfFileName, String baseURI, RDFNotation rd public static HDT generateHDT(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { return HDTManager.getInstance().doGenerateHDT(iterator, baseURI, hdtFormat, listener); } + /** + * Create an HDT file from a RDF stream. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param filename the RDF file name to guess the stream format and compresion. + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDT(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener); + } + /** + * Create an HDT file from a RDF stream. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...) + * @param compressionType Compression type of the RDF stream. (GZIP, ZIP...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener); + } + /** + * Create an HDT file from a RDF stream. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. 
+ * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener); + } + + /** + * Create an HDT file from an RDF file by sorting the triples on disk, reduce the memory required by increasing the + * IO usage. + * @param rdfFileName RDF file to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...) + * @param compressionType Compression type of the RDF file. (GZIP, ZIP...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the file cannot be found + * @throws ParserException when the RDF file can't be parsed + */ + public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, compressionType, hdtFormat, listener); + } + /** + * Create an HDT file from an RDF file without compression by sorting the triples on disk, reduce the memory + * required by increasing the IO usage. + * @param rdfFileName RDF file to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the file cannot be found + * @throws ParserException when the RDF file can't be parsed + */ + public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener); + } + /** + * Create an HDT file from an RDF file by sorting the triples on disk, reduce the memory required by increasing the + * IO usage. Will guess the RDF file compression/format with the file name. + * @param rdfFileName RDF file to parse. + * @param baseURI Base URI for the dataset. + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the file cannot be found + * @throws ParserException when the RDF file can't be parsed + */ + public static HDT generateHDTDisk(String rdfFileName, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, RDFNotation.guess(rdfFileName), CompressionType.guess(rdfFileName), hdtFormat, listener); + } + /** + * Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing + * the IO usage. + * @param fileStream RDF stream to parse. 
+ * @param baseURI Base URI for the dataset. + * @param filename the RDF file name to guess the stream format and compresion. + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDTDisk(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener); + } + /** + * Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing + * the IO usage. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...) + * @param compressionType Compression type of the RDF stream. (GZIP, ZIP...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener); + } + /** + * Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing + * the IO usage. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener); + } + /** + * Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing + * the IO usage. + * @param baseURI Base URI for the dataset. + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. 
+ * @return HDT + * @throws IOException when the stream cannot be used + */ + public static HDT generateHDTDisk(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener); + } public static TripleWriter getHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException { return HDTManager.getInstance().doGetHDTWriter(out, baseURI, hdtFormat); @@ -405,7 +553,11 @@ public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException; + protected abstract HDT doGenerateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException; + protected abstract HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException; + protected abstract HDT doGenerateHDTDisk(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException; protected abstract TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException; protected abstract TripleWriter doGetHDTWriter(String outFile, String baseURI, HDTOptions hdtFormat) throws IOException; protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException; diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java index 7ad21ace..85f1821d 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java @@ -3,6 +3,7 @@ import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.triples.TripleString; import java.io.IOException; @@ -19,7 +20,7 @@ public interface HDTSupplier { /** * @return implementation using in-memory hdt */ - static HDTSupplier memory() { + static org.rdfhdt.hdt.hdt.HDTSupplier memory() { return (iterator, baseURI, hdtFormat, listener, location) -> { try (HDT hdt = HDTManager.generateHDT(iterator, baseURI, hdtFormat, listener)) { hdt.saveToHDT(location.toAbsolutePath().toString(), listener); @@ -27,6 +28,16 @@ static HDTSupplier memory() { }; } + /** + * @return implementation using in-memory hdt + */ + static org.rdfhdt.hdt.hdt.HDTSupplier disk() { + return (iterator, baseURI, hdtFormat, listener, location) -> { + hdtFormat.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, location.toAbsolutePath().toString()); + HDTManager.generateHDTDisk(iterator, baseURI, hdtFormat, listener).close(); + }; + } + /** * Generate the HDT * diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/listener/MultiThreadListener.java b/hdt-api/src/main/java/org/rdfhdt/hdt/listener/MultiThreadListener.java new file mode 100644 index 00000000..a9014b80 --- /dev/null +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/listener/MultiThreadListener.java @@ -0,0 +1,48 @@ +package org.rdfhdt.hdt.listener; + +/** + * version of {@link org.rdfhdt.hdt.listener.ProgressListener} 
for multi-thread logging + */ +@FunctionalInterface +public interface MultiThreadListener extends ProgressListener { + + /** + * Send progress notification + * @param thread thread name + * @param level percent of the task accomplished + * @param message Description of the operation + */ + void notifyProgress(String thread, float level, String message); + + /** + * Send progress notification, should call {@link #notifyProgress(String, float, String)} + * @param level percent of the task accomplished + * @param message Description of the operation + */ + default void notifyProgress(float level, String message) { + notifyProgress(Thread.currentThread().getName(), level, message); + } + + /** + * unregister all the threads + */ + default void unregisterAllThreads() { + // should be filled by implementation if required + } + + /** + * register a thread + * @param threadName the thread name + */ + default void registerThread(String threadName) { + // should be filled by implementation if required + } + + /** + * unregister a thread + * @param threadName the thread name + */ + default void unregisterThread(String threadName) { + // should be filled by implementation if required + } +} diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java new file mode 100644 index 00000000..dd680697 --- /dev/null +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java @@ -0,0 +1,103 @@ +package org.rdfhdt.hdt.options; + +/** + * Keys usable with {@link org.rdfhdt.hdt.options.HDTOptions#set(String, String)} + * @author Antoine Willerval + */ +public class HDTOptionsKeys { + /** + * Key for the compression mode for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods. + * Value can be {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE} or + * {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL} + */ + public static final String LOADER_DISK_COMPRESSION_MODE_KEY = "loader.disk.compressMode"; + /** + * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort all the files before going to the next step; slower, + * but decreases the RAM usage. Default configuration. + */ + public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE = "compressionComplete"; + /** + * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort while reading the file before going to the next + * step; faster, but increases the RAM usage. + */ + public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial"; + /** + * Key for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods, + * sets the number of workers used to merge the data. Defaults to the number of processors. long value. + */ + public static final String LOADER_DISK_COMPRESSION_WORKER_KEY = "loader.disk.compressWorker"; + /** + * Key for the maximum size of a chunk on disk for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk + * methods; a chunk is sorted in RAM before being written to disk. long value. + */ + public static final String LOADER_DISK_CHUNK_SIZE_KEY = "loader.disk.chunkSize"; + /** + * Key for the location of the working directory for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods; + * this directory will be deleted after the HDT generation. By default, the value is random; it is recommended to + * set this option to be able to delete the directory in case of an interruption of the process. file value.
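+ * For example (illustrative call, hypothetical path): opts.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, "/tmp/hdt-gen-work");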
+ */ + public static final String LOADER_DISK_LOCATION_KEY = "loader.disk.location"; + /** + * Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods, + * this option will create a hdt file after the HDT generation, the returned HDT will be a mapped HDT of the HDT + * file. slower, increase the disk usage, but drastically reduce the RAM usage. file value. + */ + public static final String LOADER_DISK_FUTURE_HDT_LOCATION_KEY = "loader.disk.futureHDTLocation"; + /** + * Key for the maximum number of file opened at the same time, should be greater than {@link #LOADER_DISK_KWAY_KEY}, + * 1024 by default + */ + public static final String LOADER_DISK_MAX_FILE_OPEN_KEY = "loader.disk.maxFileOpen"; + /** + * Key for the number of chunk layers opened at the same time, by default + *
min(log2(maxFileOpen), chunkSize / (fileBufferSize * compressWorker))
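+ * For instance, with illustrative (non-default) values maxFileOpen = 1024, chunkSize = 64M, fileBufferSize = 4K and compressWorker = 8: min(log2(1024), 64M / (4K * 8)) = min(10, 2048) = 10 layers.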
+ */ + public static final String LOADER_DISK_KWAY_KEY = "loader.disk.kway"; + /** + * Key for the size of the buffers when opening a file. + */ + public static final String LOADER_DISK_BUFFER_SIZE_KEY = "loader.disk.fileBufferSize"; + /** + * Key for the loading mode of an RDF file for the + * {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDT(String, String, org.rdfhdt.hdt.enums.RDFNotation, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)} + * method; this key does not apply to the other methods. + * Value can be {@link #LOADER_TYPE_VALUE_ONE_PASS} or {@link #LOADER_TYPE_VALUE_TWO_PASS}. + */ + public static final String LOADER_TYPE_KEY = "loader.type"; + /** + * Value for {@link #LOADER_TYPE_KEY}, reads the RDF file twice, reducing the RAM usage + */ + public static final String LOADER_TYPE_VALUE_TWO_PASS = "two-pass"; + /** + * Value for {@link #LOADER_TYPE_KEY}, reads the RDF file only once; default value + */ + public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass"; + /** + * Key for the location of the working directory for the {@link org.rdfhdt.hdt.hdt.HDTManager} catTree methods; + * this directory will be deleted after the HDT generation. By default, the value is random; it is recommended to + * set this option to be able to delete the directory in case of an interruption of the process. file value. + */ + public static final String LOADER_CATTREE_LOCATION_KEY = "loader.cattree.location"; + /** + * Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} catTree methods; + * this option will create an HDT file after the HDT generation, and the returned HDT will be a mapped HDT of that + * file. Slower and increases the disk usage, but drastically reduces the RAM usage. file value. + */ + public static final String LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY = "loader.cattree.futureHDTLocation"; + /** + * Key for enabling the profiler (if implemented), defaults to false. Boolean value + */ + public static final String PROFILER_KEY = "profiler"; + /** + * Key for enabling the canonical NTriples simple file parser, defaults to false. Boolean value + */ + public static final String NT_SIMPLE_PARSER_KEY = "parser.ntSimpleParser"; + /** + * Key for setting the triple order.
see {@link org.rdfhdt.hdt.enums.TripleComponentOrder}'s names to have the values + * default to {@link org.rdfhdt.hdt.enums.TripleComponentOrder#SPO} + */ + public static final String TRIPLE_ORDER_KEY = "triplesOrder"; + + + private HDTOptionsKeys() {} +} diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java index dc4f5de5..d3ce6f8b 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java @@ -4,6 +4,7 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.function.BinaryOperator; /** * Rdf flux stopper descriptor @@ -99,4 +100,47 @@ public void restart() { * restart the flux stop */ void restart(); + + /** + * combine 2 rdf flux stop with a boolean operation + * @param fluxStop the other flux stop + * @param operator the operator + * @return rdffluxstop + * @see #and(RDFFluxStop) + * @see #or(RDFFluxStop) + */ + default RDFFluxStop booleanOp(RDFFluxStop fluxStop, BinaryOperator operator) { + return new RDFFluxStop() { + @Override + public boolean canHandle(TripleString ts) { + boolean left = RDFFluxStop.this.canHandle(ts); + boolean right = fluxStop.canHandle(ts); + return operator.apply(left, right); + } + + @Override + public void restart() { + RDFFluxStop.this.restart(); + fluxStop.restart(); + } + }; + } + + /** + * {@link #booleanOp(RDFFluxStop, BinaryOperator)} version for AND + * @param fluxStop other flux stop + * @return rdffluxstop + */ + default RDFFluxStop and(RDFFluxStop fluxStop) { + return booleanOp(fluxStop, (a, b) -> a && b); + } + + /** + * {@link #booleanOp(RDFFluxStop, BinaryOperator)} version for OR + * @param fluxStop other flux stop + * @return rdffluxstop + */ + default RDFFluxStop or(RDFFluxStop fluxStop) { + return booleanOp(fluxStop, (a, b) -> a || b); + } } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFParserCallback.java b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFParserCallback.java index 99bfae3d..098f2d0f 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFParserCallback.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFParserCallback.java @@ -40,6 +40,7 @@ * */ public interface RDFParserCallback { + @FunctionalInterface interface RDFCallback { void processTriple(TripleString triple, long pos); } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java index 8d4b4005..d581c8c6 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java @@ -337,4 +337,16 @@ public final void dumpNtriple(Appendable out) throws IOException { out.append('<').append(object).append("> .\n"); } } + + /** + * convert all the elements into {@link String} and create a new TripleString + * @return tripleString + */ + public TripleString tripleToString() { + return new TripleString( + subject.toString(), + predicate.toString(), + object.toString() + ); + } } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java b/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java index 88015230..42552eb8 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java @@ -100,6 +100,10 @@ public static void escapeString(String label, Appendable appendable) } } } + + if (last == label.length()) { + last--; + } for (int i = 
first; i <= last; i++) { char c = label.charAt(i); diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java index de34a168..15900c1c 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java @@ -15,9 +15,8 @@ public class HDTVerify { private HDTVerify() {} private static void print(byte[] arr) { - for (int i = 0; i < arr.length; i++) { - byte b = arr[i]; - System.out.print(String.format("%02X ", b)); + for (byte b : arr) { + System.out.printf("%02X ", b); } System.out.println(); } @@ -42,11 +41,11 @@ public static void checkDictionarySectionOrder(Iterator CharSequence charSeq = it.next(); String str = charSeq.toString(); - if(lastCharseq!=null && ((cmp=comparator.compare(lastCharseq, charSeq))>0 )) { + if(lastCharseq!=null && ((cmp=comparator.compare(lastCharseq, charSeq))>=0 )) { System.out.println("ERRA: "+lastCharseq+" / "+charSeq); } - if(lastStr!=null && ((cmp2=lastStr.compareTo(str))>0)) { + if(lastStr!=null && ((cmp2=lastStr.compareTo(str))>=0)) { System.out.println("ERRB: "+lastStr+" / "+str); } @@ -66,11 +65,15 @@ public static void main(String[] args) throws Throwable { System.out.println("hdtVerify "); System.exit(-1); } - HDT hdt = HDTManager.mapHDT(args[0], null); - - checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries()); - checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries()); - checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries()); - checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries()); + try (HDT hdt = HDTManager.mapHDT(args[0], null)) { + System.out.println("Checking subject entries"); + checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries()); + System.out.println("Checking predicate entries"); + checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries()); + System.out.println("Checking object entries"); + checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries()); + System.out.println("Checking shared entries"); + checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries()); + } } } diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java index b330033b..89ed7d63 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java @@ -27,8 +27,10 @@ package org.rdfhdt.hdt.tools; import java.io.IOException; +import java.nio.file.Path; import java.util.List; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; @@ -36,14 +38,17 @@ import org.rdfhdt.hdt.hdt.HDTSupplier; import org.rdfhdt.hdt.hdt.HDTVersion; import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.rdf.RDFFluxStop; +import org.rdfhdt.hdt.util.BitUtil; import org.rdfhdt.hdt.util.StopWatch; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.internal.Lists; import org.rdfhdt.hdt.util.StringUtil; +import org.rdfhdt.hdt.util.listener.MultiThreadListenerConsole; /** * @author mario.arias @@ -85,6 +90,12 @@ private static long 
getMaxTreeCatChunkSize() { @Parameter(names = "-quiet", description = "Do not show progress of the conversion") public boolean quiet; + @Parameter(names = "-disk", description = "Generate the HDT on disk to reduce memory usage") + public boolean disk; + + @Parameter(names = "-disklocation", description = "Location to run the generate disk, by default in a temporary directory, will be deleted after") + public String diskLocation; + @Parameter(names = "-canonicalntfile", description = "Only for NTriples input. Use a Fast NT file parser the input should be in a canonical form. See https://www.w3.org/TR/n-triples/#h2_canonical-ntriples") public boolean ntSimpleLoading; @@ -94,34 +105,49 @@ private static long getMaxTreeCatChunkSize() { @Parameter(names = "-cattreelocation", description = "Only with -cattree, set the tree building location") public String catTreeLocation; + private static long findBestMemoryChunkDiskMapTreeCat() { + Runtime runtime = Runtime.getRuntime(); + long maxRam = (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * 0.85) / 3; + + int shift = 0; + + while (shift != 63 && (1L << shift) * BitUtil.log2(1L << shift) < maxRam) { + shift++; + } + + // it will take at most "shift" bits per triple + // we divide by 3 for the 3 maps + return maxRam / shift; + } + public void execute() throws ParserException, IOException { HDTSpecification spec; - if(configFile!=null) { + if (configFile != null) { spec = new HDTSpecification(configFile); } else { spec = new HDTSpecification(); } - if(options!=null) { + if (options != null) { spec.setOptions(options); } - if(baseURI==null) { - baseURI = "file://"+rdfInput; + if (baseURI == null) { + baseURI = "file://" + rdfInput; } - RDFNotation notation=null; - if(rdfType!=null) { + RDFNotation notation = null; + if (rdfType != null) { try { notation = RDFNotation.parse(rdfType); } catch (IllegalArgumentException e) { - System.out.println("Notation "+rdfType+" not recognised."); + System.out.println("Notation " + rdfType + " not recognised."); } } - - if(notation==null) { + + if (notation == null) { try { - notation = RDFNotation.guess(rdfInput); + notation = RDFNotation.guess(rdfInput); } catch (IllegalArgumentException e) { - System.out.println("Could not guess notation for "+rdfInput+" Trying NTriples"); + System.out.println("Could not guess notation for " + rdfInput + " Trying NTriples"); notation = RDFNotation.NTRIPLES; } } @@ -141,19 +167,59 @@ public void execute() throws ParserException, IOException { long maxTreeCatChunkSize = getMaxTreeCatChunkSize(); - System.out.println("Compute HDT with HDTCatTree using chunk of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true)); - - hdt = HDTManager.catTree( - RDFFluxStop.sizeLimit(maxTreeCatChunkSize), - HDTSupplier.memory(), - rdfInput, - baseURI, - notation, - spec, - this - ); + if (!quiet) { + System.out.println("Compute HDT with HDTCatTree using chunk of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true)); + } + + if (disk) { + if (diskLocation != null) { + spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, diskLocation); + if (!quiet) { + System.out.println("Using temp directory " + diskLocation); + } + } + MultiThreadListenerConsole listenerConsole = !quiet ? 
new MultiThreadListenerConsole() : null; + hdt = HDTManager.catTree( + RDFFluxStop.countLimit(findBestMemoryChunkDiskMapTreeCat()), + HDTSupplier.disk(), + rdfInput, + baseURI, + notation, + spec, + listenerConsole + ); + if (listenerConsole != null) { + listenerConsole.notifyProgress(100, "done"); + } + } else { + hdt = HDTManager.catTree( + RDFFluxStop.sizeLimit(maxTreeCatChunkSize), + HDTSupplier.memory(), + rdfInput, + baseURI, + notation, + spec, + this + ); + } + } else if (disk) { + if (!quiet) { + System.out.println("Generating using generateHDTDisk"); + } + spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, hdtOutput); + if (diskLocation != null) { + spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, diskLocation); + if (!quiet) { + System.out.println("Using temp directory " + diskLocation); + } + } + MultiThreadListenerConsole listenerConsole = !quiet ? new MultiThreadListenerConsole() : null; + hdt = HDTManager.generateHDTDisk(rdfInput, baseURI, notation, CompressionType.guess(rdfInput), spec, listenerConsole); + if (listenerConsole != null) { + listenerConsole.notifyProgress(100, "done"); + } } else { - hdt = HDTManager.generateHDT(rdfInput, baseURI, notation , spec, this); + hdt = HDTManager.generateHDT(rdfInput, baseURI, notation, spec, this); } System.out.println("File converted in: "+sw.stopAndShow()); @@ -168,13 +234,11 @@ public void execute() throws ParserException, IOException { } // Dump to HDT file - sw = new StopWatch(); - - if (!catTree) { - // ignore catTree save because the file is already here + if (!disk && !catTree) { + sw = new StopWatch(); hdt.saveToHDT(hdtOutput, this); + System.out.println("HDT saved to file in: "+sw.stopAndShow()); } - System.out.println("HDT saved to file in: "+sw.stopAndShow()); // Generate index and dump it to .hdt.index file sw.reset(); diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java new file mode 100644 index 00000000..915b1e8f --- /dev/null +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java @@ -0,0 +1,104 @@ +package org.rdfhdt.hdt.util.listener; + +import java.util.Map; +import java.util.TreeMap; + +import org.rdfhdt.hdt.listener.MultiThreadListener; + +public class MultiThreadListenerConsole implements MultiThreadListener { + private static final String ERASE_LINE = "\r\033[K"; + + private static String goBackNLine(int line) { + return "\033[" + line + "A"; + } + + /** + * true if the system allow ascii sequence, false otherwise + */ + private static final boolean ALLOW_ASCII_SEQUENCE; + + static { + String env; + try { + env = System.getenv("TERM"); + } catch (SecurityException e) { + env = null; + } + + ALLOW_ASCII_SEQUENCE = System.console() != null && !(env == null || env.isEmpty()); + } + + private final Map threadMessages; + private int previous; + + public MultiThreadListenerConsole() { + this(ALLOW_ASCII_SEQUENCE); + } + + public MultiThreadListenerConsole(boolean asciiListener) { + if (asciiListener) { + threadMessages = new TreeMap<>(); + } else { + threadMessages = null; + } + } + + @Override + public synchronized void unregisterAllThreads() { + if (threadMessages == null) { + return; + } + threadMessages.clear(); + notifyProgress(0, "-"); + } + + @Override + public synchronized void registerThread(String threadName) { + notifyProgress(threadName, 0, "-"); + } + + @Override + public synchronized void unregisterThread(String 
threadName) { + if (threadMessages == null) { + return; + } + threadMessages.remove(threadName); + render(); + } + + @Override + public synchronized void notifyProgress(String thread, float level, String message) { + String msg = "[" + level + "] " + message; + if (threadMessages != null) { + threadMessages.put(thread, msg); + render(); + } else { + System.out.println("[" + thread + "]" + msg); + } + } + + private void render() { + if (threadMessages == null) { + return; + } + StringBuilder message = new StringBuilder(); + int lines = threadMessages.size(); + message.append("\r"); + // go back each line of the thread message + if (previous != 0) { + message.append(goBackNLine(previous)); + } + // write each thread logs + threadMessages.forEach((thread, msg) -> { + message.append(ERASE_LINE).append("[").append(thread).append("]").append(msg).append("\n"); + }); + // remove previous printing + int toRemove = previous - lines; + if (toRemove > 0) { + message.append((ERASE_LINE+"\n").repeat(toRemove)).append(goBackNLine(toRemove)); + } + previous = lines; + + System.out.print(message); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java new file mode 100644 index 00000000..799aaca4 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java @@ -0,0 +1,175 @@ +package org.rdfhdt.hdt.compact.bitmap; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRC8; +import org.rdfhdt.hdt.util.crc.CRCOutputStream; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; + +/** + * {@link org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap} implementation for only appending/one read saving into a + * buffer file + * @author Antoine Willerval + */ +public class AppendableWriteBitmap implements ModifiableBitmap, Closeable { + private long countZeros; + private long countOnes; + private long numbits; + private final CloseSuppressPath file; + private final CRCOutputStream stream; + private long currentElement; + private int bit; + private boolean saved; + + public AppendableWriteBitmap(CloseSuppressPath storage, int bufferSize) throws IOException { + file = storage; + stream = new CRCOutputStream(storage.openOutputStream(bufferSize), new CRC32()); + } + + @Override + public void set(long position, boolean value) { + throw new NotImplementedException(); + } + + @Override + public void append(boolean value) { + // count for stats + if (value) { + countOnes++; + } else { + countZeros++; + } + // increase the numbits + numbits++; + + // set the value + if (value) { + currentElement |= 1L << bit; + } + bit++; + + // write the value if required + try { + pushByte(false); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void pushByte(boolean force) throws IOException { + if (bit == 64 || force) { + BitUtil.writeLowerBitsByteAligned(currentElement, bit, stream); + // reset the current element writing + bit = 0; + currentElement = 0L; + } + } + + @Override + public boolean access(long 
position) { + throw new NotImplementedException(); + } + + @Override + public long rank1(long position) { + throw new NotImplementedException(); + } + + @Override + public long rank0(long position) { + throw new NotImplementedException(); + } + + @Override + public long selectPrev1(long start) { + throw new NotImplementedException(); + } + + @Override + public long selectNext1(long start) { + throw new NotImplementedException(); + } + + @Override + public long select0(long n) { + throw new NotImplementedException(); + } + + @Override + public long select1(long n) { + throw new NotImplementedException(); + } + + @Override + public long getNumBits() { + return numbits; + } + + @Override + public long countOnes() { + return countOnes; + } + + @Override + public long countZeros() { + return countZeros; + } + + @Override + public long getSizeBytes() { + return (numbits - 1) / 8 + 1; + } + + @Override + public void save(OutputStream output, ProgressListener listener) throws IOException { + saved = true; + // flush the pending partial byte, if any + pushByte(true); + // complete the file + stream.writeCRC(); + stream.close(); + + CRCOutputStream out = new CRCOutputStream(output, new CRC8()); + + // Write Type and Numbits + out.write(BitmapFactory.TYPE_BITMAP_PLAIN); + VByte.encode(out, numbits); + + // Write CRC + out.writeCRC(); + + // write the storage file, it already contains the CRC + Files.copy(file.getJavaPath(), output); + + // delete the temporary file (a CloseSuppressPath is deleted on close) + file.close(); + } + + @Override + public void load(InputStream input, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public String getType() { + return HDTVocabulary.BITMAP_TYPE_PLAIN; + } + + @Override + public void close() throws IOException { + if (!saved) { + IOUtil.closeAll(stream, file); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/DynamicSequence.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/DynamicSequence.java index 637e471f..6ebd9289 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/DynamicSequence.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/DynamicSequence.java @@ -27,21 +27,17 @@ package org.rdfhdt.hdt.compact.sequence; +import org.rdfhdt.hdt.util.disk.LongArray; + /** * @author mario.arias * */ -public interface DynamicSequence extends Sequence { - /** - * Set a new value at the specified position. - * @param index - * @param value - */ - void set(long index, long value); - +public interface DynamicSequence extends Sequence, LongArray { + /** * Append a new value after the last position, increasing the number of elements by one. - * @param value + * @param value the value to append */ void append(long value); @@ -55,4 +51,9 @@ public interface DynamicSequence extends Sequence { * Use an advanced algorithm to reduce the size to the minimum, even if it is costly. 
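* (an implementation can, for example, rescan the stored values to find the real maximum bit width and reallocate its backing storage with that width)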
*/ void aggressiveTrimToSize(); + + @Override + default long length() { + return getNumberOfElements(); + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64BigDisk.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64BigDisk.java index 0cbb2173..b2f4362d 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64BigDisk.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64BigDisk.java @@ -161,9 +161,9 @@ public long get(long position) { @Override public void set(long position, long value) { - //if(value<0 || value>maxvalue) { - //throw new IllegalArgumentException("Value exceeds the maximum for this data structure"); - //} + if (value<0 || value>maxvalue) { + throw new IllegalArgumentException("Value exceeds the maximum for this data structure"); + } //System.out.println("numbits "+this.numbits); setField(data, numbits, position, value); } @@ -296,4 +296,4 @@ public void close() throws IOException { } data=null; } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryPrivate.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryPrivate.java index ca11f961..c3b40ea9 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryPrivate.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryPrivate.java @@ -25,7 +25,11 @@ public interface DictionaryPrivate extends Dictionary { * Loads all information from another dictionary into this dictionary. */ void load(TempDictionary other, ProgressListener listener); - + /** + * Same as {@link #load(TempDictionary, org.rdfhdt.hdt.listener.ProgressListener)}, but reads all the sections at the same time + */ + void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException; + /** * Saves the dictionary to an OutputStream */ diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java index 1c2903c9..9786f03e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java @@ -30,8 +30,11 @@ import org.rdfhdt.hdt.dictionary.DictionaryPrivate; import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.TempDictionary; import org.rdfhdt.hdt.enums.DictionarySectionRole; import org.rdfhdt.hdt.enums.TripleComponentRole; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.DelayedString; @@ -232,5 +235,9 @@ public TreeMap<String, DictionarySection> getAllObjects() { public long getNAllObjects() { throw new IllegalArgumentException("Method is not applicable on this dictionary"); } - + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + throw new NotImplementedException(); + } } \ No newline at end of file diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java new file mode 100644 index 00000000..b0084c7f --- /dev/null +++ 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java @@ -0,0 +1,232 @@ +package org.rdfhdt.hdt.dictionary.impl; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.dictionary.TempDictionary; +import org.rdfhdt.hdt.dictionary.TempDictionarySection; +import org.rdfhdt.hdt.dictionary.impl.section.OneReadDictionarySection; +import org.rdfhdt.hdt.enums.TripleComponentRole; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; +import org.rdfhdt.hdt.iterator.utils.MapIterator; +import org.rdfhdt.hdt.iterator.utils.NotificationExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressUtil; +import org.rdfhdt.hdt.util.string.ByteStringUtil; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Comparator; + +/** + * Version of the temp dictionary that creates the four sections from the SPO compressed section results; it should be loaded in an + * async way with {@link org.rdfhdt.hdt.dictionary.DictionaryPrivate#loadAsync(org.rdfhdt.hdt.dictionary.TempDictionary, org.rdfhdt.hdt.listener.ProgressListener)} + * @author Antoine Willerval + */ +public class CompressFourSectionDictionary implements TempDictionary { + private final ExceptionThread cfsdThread; + private final TempDictionarySection subject; + private final TempDictionarySection predicate; + private final TempDictionarySection object; + private final TempDictionarySection shared; + + private static void sendPiped(IndexedNode node, long index, PipedCopyIterator<CharSequence> pipe, CompressUtil.DuplicatedIterator it, NodeConsumerMethod method) { + it.setLastHeader(index); + method.consume(node.getIndex(), index); + pipe.addElement(node.getNode().toString()); + } + + public CompressFourSectionDictionary(CompressionResult compressionResult, NodeConsumer nodeConsumer, ProgressListener listener) { + long splits = Math.max(20, compressionResult.getTripleCount() / 10_000); + // send duplicates to the consumer while reading the nodes + CompressUtil.DuplicatedIterator sortedSubject = + CompressUtil.asNoDupeCharSequenceIterator( + new NotificationExceptionIterator<>( + compressionResult.getSubjects(), + compressionResult.getTripleCount(), + splits, + "Subject section filling", + listener + ), + (originalIndex, duplicatedIndex, lastHeader) -> nodeConsumer.onSubject(duplicatedIndex, lastHeader) + ); + CompressUtil.DuplicatedIterator sortedPredicate = + CompressUtil.asNoDupeCharSequenceIterator( + new NotificationExceptionIterator<>( + compressionResult.getPredicates(), + compressionResult.getTripleCount(), + splits, + "Predicate section filling", + listener + ), + (originalIndex, duplicatedIndex, lastHeader) -> nodeConsumer.onPredicate(duplicatedIndex, lastHeader) + ); + CompressUtil.DuplicatedIterator sortedObject = + CompressUtil.asNoDupeCharSequenceIterator( + new NotificationExceptionIterator<>( + compressionResult.getObjects(), + compressionResult.getTripleCount(), + splits, + "Object section filling", + listener + ), + (originalIndex, duplicatedIndex, lastHeader) -> nodeConsumer.onObject(duplicatedIndex, lastHeader) + ); + long subjects = 
compressionResult.getSubjectsCount(); + long predicates = compressionResult.getPredicatesCount(); + long objects = compressionResult.getObjectsCount(); + long shareds = compressionResult.getSharedCount(); + + // iterators to pipe the subject, object and shared sections + PipedCopyIterator<CharSequence> subject = new PipedCopyIterator<>(); + PipedCopyIterator<CharSequence> object = new PipedCopyIterator<>(); + PipedCopyIterator<CharSequence> shared = new PipedCopyIterator<>(); + Comparator<CharSequence> comparator = CharSequenceComparator.getInstance(); + cfsdThread = new ExceptionThread(() -> { + long sharedId = 1; + long subjectId = 1; + long objectId = 1; + sharedLoop: + while (sortedObject.hasNext() && sortedSubject.hasNext()) { + // the last match was a shared node, fetch a new candidate from each side + IndexedNode newSubject = sortedSubject.next(); + IndexedNode newObject = sortedObject.next(); + int comp = comparator.compare(newSubject.getNode(), newObject.getNode()); + while (comp != 0) { + if (comp < 0) { + sendPiped(newSubject, CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); + if (!sortedSubject.hasNext()) { + // no more subjects, send the current object and break the shared loop + sendPiped(newObject, CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + break sharedLoop; + } + newSubject = sortedSubject.next(); + } else { + sendPiped(newObject, CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + if (!sortedObject.hasNext()) { + // no more objects, send the current subject and break the shared loop + sendPiped(newSubject, CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); + break sharedLoop; + } + newObject = sortedObject.next(); + } + comp = comparator.compare(newSubject.getNode(), newObject.getNode()); + } + // shared element + long shid = CompressUtil.asShared(sharedId++); + sortedSubject.setLastHeader(shid); + sortedObject.setLastHeader(shid); + nodeConsumer.onSubject(newSubject.getIndex(), shid); + nodeConsumer.onObject(newObject.getIndex(), shid); + shared.addElement(newSubject.getNode().toString()); + } + // at least one iterator is empty, closing the shared pipe + shared.closePipe(); + // do we have subjects? + while (sortedSubject.hasNext()) { + sendPiped(sortedSubject.next(), CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); + } + subject.closePipe(); + // do we have objects? 
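+ // nothing left here can be shared: every shared node was emitted in the loop above, so the remaining objects are piped out with plain object ids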
+ while (sortedObject.hasNext()) { + sendPiped(sortedObject.next(), CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + } + object.closePipe(); + }, "CFSDPipeBuilder").startAll(); + + // send to the consumer the element while parsing them + this.subject = new OneReadDictionarySection(subject, subjects); + this.predicate = new OneReadDictionarySection(new MapIterator<>(sortedPredicate, (node, index) -> { + long header = CompressUtil.getHeaderId(index + 1); + sortedPredicate.setLastHeader(header); + nodeConsumer.onPredicate(node.getIndex(), header); + // force duplication because it's not made in a pipe like with the others + return node.getNode().toString(); + }), predicates); + this.object = new OneReadDictionarySection(object, objects); + this.shared = new OneReadDictionarySection(shared, shareds); + } + + @Override + public TempDictionarySection getSubjects() { + return subject; + } + + @Override + public TempDictionarySection getPredicates() { + return predicate; + } + + @Override + public TempDictionarySection getObjects() { + return object; + } + + @Override + public TempDictionarySection getShared() { + return shared; + } + + @Override + public void startProcessing() { + } + + @Override + public void endProcessing() { + } + + @Override + public long insert(CharSequence str, TripleComponentRole position) { + throw new NotImplementedException(); + } + + @Override + public void reorganize() { + // already organized + } + + @Override + public void reorganize(TempTriples triples) { + // already organized + } + + @Override + public boolean isOrganized() { + return true; + } + + @Override + public void clear() { + } + + @Override + public long stringToId(CharSequence subject, TripleComponentRole role) { + throw new NotImplementedException(); + } + + @Override + public void close() throws IOException { + try { + cfsdThread.interrupt(); + cfsdThread.joinAndCrashIfRequired(); + } catch (InterruptedException e) { + // normal + } + } + + public interface NodeConsumer { + void onSubject(long preMapId, long newMapId); + void onPredicate(long preMapId, long newMapId); + void onObject(long preMapId, long newMapId); + } + + private interface NodeConsumerMethod { + void consume(long id, long header); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary.java index 3fc8312d..3617fa7e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.util.concurrent.atomic.AtomicReference; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; import org.rdfhdt.hdt.dictionary.TempDictionary; @@ -44,6 +45,7 @@ import org.rdfhdt.hdt.options.ControlInfo.Type; import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -55,7 +57,7 @@ */ public class FourSectionDictionary extends BaseDictionary { - public FourSectionDictionary(HDTOptions spec, + public FourSectionDictionary(HDTOptions spec, DictionarySectionPrivate s, DictionarySectionPrivate p, DictionarySectionPrivate 
o, DictionarySectionPrivate sh) { super(spec); this.subjects = s; @@ -63,7 +65,7 @@ public FourSectionDictionary(HDTOptions spec, this.objects = o; this.shared = sh; } - + public FourSectionDictionary(HDTOptions spec) { super(spec); // FIXME: Read type from spec. @@ -85,6 +87,19 @@ public void load(TempDictionary other, ProgressListener listener) { shared.load(other.getShared(), iListener); } + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + IntermediateListener iListener = new IntermediateListener(null); + new ExceptionThread(() -> predicates.load(other.getPredicates(), iListener), "FourSecSAsyncReaderP") + .attach( + new ExceptionThread(() -> subjects.load(other.getSubjects(), iListener), "FourSecSAsyncReaderS"), + new ExceptionThread(() -> shared.load(other.getShared(), iListener), "FourSecSAsyncReaderSh"), + new ExceptionThread(() -> objects.load(other.getObjects(), iListener), "FourSecSAsyncReaderO") + ) + .startAll() + .joinAndCrashIfRequired(); + } + /* (non-Javadoc) * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener) */ @@ -111,7 +126,7 @@ public void load(InputStream input, ControlInfo ci, ProgressListener listener) t if(ci.getType()!=ControlInfo.Type.DICTIONARY) { throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); } - + IntermediateListener iListener = new IntermediateListener(listener); shared = DictionarySectionFactory.loadFrom(input, iListener); @@ -119,7 +134,7 @@ public void load(InputStream input, ControlInfo ci, ProgressListener listener) t predicates = DictionarySectionFactory.loadFrom(input, iListener); objects = DictionarySectionFactory.loadFrom(input, iListener); } - + @Override public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { ControlInformation ci = new ControlInformation(); @@ -127,13 +142,13 @@ public void mapFromFile(CountInputStream in, File f, ProgressListener listener) if(ci.getType()!=ControlInfo.Type.DICTIONARY) { throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); } - + IntermediateListener iListener = new IntermediateListener(listener); shared = DictionarySectionFactory.loadFrom(in, f, iListener); subjects = DictionarySectionFactory.loadFrom(in, f, iListener); predicates = DictionarySectionFactory.loadFrom(in, f, iListener); objects = DictionarySectionFactory.loadFrom(in, f, iListener); - + // Use cache only for predicates. Preload only up to 100K predicates. 
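// (if re-enabled, the cache below would wrap the predicate section so that repeated extract() calls return memoized strings instead of re-decoding the PFC blocks)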
// FIXME: DISABLED // predicates = new DictionarySectionCacheAll(predicates, predicates.getNumberOfElements()<100000); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryBig.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryBig.java index e08bc169..c8f79fad 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryBig.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryBig.java @@ -44,6 +44,7 @@ import org.rdfhdt.hdt.options.ControlInfo.Type; import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -84,6 +85,19 @@ public void load(TempDictionary other, ProgressListener listener) { shared.load(other.getShared(), iListener); } + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + IntermediateListener iListener = new IntermediateListener(null); + new ExceptionThread(() -> predicates.load(other.getPredicates(), iListener), "FourSecSAsyncReaderP") + .attach( + new ExceptionThread(() -> subjects.load(other.getSubjects(), iListener), "FourSecSAsyncReaderS"), + new ExceptionThread(() -> shared.load(other.getShared(), iListener), "FourSecSAsyncReaderSh"), + new ExceptionThread(() -> objects.load(other.getObjects(), iListener), "FourSecSAsyncReaderO") + ) + .startAll() + .joinAndCrashIfRequired(); + } + /* (non-Javadoc) * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener) */ diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java index 9976d9c5..25175178 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java @@ -4,10 +4,12 @@ import org.rdfhdt.hdt.dictionary.DictionaryPrivate; import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.TempDictionary; import org.rdfhdt.hdt.dictionary.impl.section.PFCOptimizedExtractor; import org.rdfhdt.hdt.enums.DictionarySectionRole; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.util.LiteralsUtils; import org.rdfhdt.hdt.util.string.CompactString; @@ -323,4 +325,9 @@ public AbstractMap.SimpleEntry getDataTypeRange(String dataType){ } return new AbstractMap.SimpleEntry<>(0L,0L); } + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) { + throw new NotImplementedException(); + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java index 8199aace..67f4bf69 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java @@ -7,6 +7,7 @@ import 
org.rdfhdt.hdt.dictionary.impl.section.HashDictionarySection; import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySection; import org.rdfhdt.hdt.exceptions.IllegalFormatException; +import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.header.Header; import org.rdfhdt.hdt.listener.ProgressListener; @@ -49,6 +50,7 @@ public void load(TempDictionary other, ProgressListener listener) { predicates.load(other.getPredicates(), iListener); Iterator iter = other.getObjects().getEntries(); + // TODO: allow the usage of OneReadDictionarySection HashMap literalsCounts = ((HashDictionarySection)other.getObjects()).getLiteralsCounts(); if(literalsCounts.containsKey("NO_DATATYPE")) literalsCounts.put("NO_DATATYPE",literalsCounts.get("NO_DATATYPE") - other.getShared().getNumberOfElements()); @@ -229,4 +231,9 @@ public void close() throws IOException { } } + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) { + throw new NotImplementedException(); + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java new file mode 100644 index 00000000..73549bac --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java @@ -0,0 +1,106 @@ +package org.rdfhdt.hdt.dictionary.impl; + +import org.rdfhdt.hdt.dictionary.TempDictionary; +import org.rdfhdt.hdt.dictionary.impl.section.WriteDictionarySection; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.io.CountInputStream; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.listener.ListenerUtil; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Path; + +/** + * Version of four section dictionary with {@link org.rdfhdt.hdt.dictionary.impl.section.WriteDictionarySection} + * @author Antoine Willerval + */ +public class WriteFourSectionDictionary extends BaseDictionary { + public WriteFourSectionDictionary(HDTOptions spec, Path filename, int bufferSize) { + super(spec); + String name = filename.getFileName().toString(); + subjects = new WriteDictionarySection(spec, filename.resolveSibling(name + "SU"), bufferSize); + predicates = new WriteDictionarySection(spec, filename.resolveSibling(name + "PR"), bufferSize); + objects = new WriteDictionarySection(spec, filename.resolveSibling(name + "OB"), bufferSize); + shared = new WriteDictionarySection(spec, filename.resolveSibling(name + "SH"), bufferSize); + } + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + MultiThreadListener ml = ListenerUtil.multiThreadListener(listener); + ml.unregisterAllThreads(); + ExceptionThread.async("FourSecSAsyncReader", + () -> predicates.load(other.getPredicates(), new IntermediateListener(ml, "Predicate: ")), + () -> subjects.load(other.getSubjects(), new IntermediateListener(ml, "Subjects: ")), + () -> 
shared.load(other.getShared(), new IntermediateListener(ml, "Shared: ")), + () -> objects.load(other.getObjects(), new IntermediateListener(ml, "Object: ")) + ) + .startAll() + .joinAndCrashIfRequired(); + ml.unregisterAllThreads(); + } + + @Override + public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void load(TempDictionary other, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { + ci.setType(ControlInfo.Type.DICTIONARY); + ci.setFormat(getType()); + ci.setInt("elements", this.getNumberOfElements()); + ci.save(output); + + IntermediateListener iListener = new IntermediateListener(listener); + iListener.setRange(0, 25); + iListener.setPrefix("Save shared: "); + shared.save(output, iListener); + iListener.setRange(25, 50); + iListener.setPrefix("Save subjects: "); + subjects.save(output, iListener); + iListener.setRange(50, 75); + iListener.setPrefix("Save predicates: "); + predicates.save(output, iListener); + iListener.setRange(75, 100); + iListener.setPrefix("Save objects: "); + objects.save(output, iListener); + } + + @Override + public void populateHeader(Header header, String rootNode) { + header.insert(rootNode, HDTVocabulary.DICTIONARY_TYPE, getType()); + header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSHARED, getNshared()); + header.insert(rootNode, HDTVocabulary.DICTIONARY_SIZE_STRINGS, size()); + } + + @Override + public String getType() { + return HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(shared, subjects, predicates, objects); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java new file mode 100644 index 00000000..1af57850 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java @@ -0,0 +1,76 @@ +package org.rdfhdt.hdt.dictionary.impl.section; + +import org.rdfhdt.hdt.dictionary.TempDictionarySection; +import org.rdfhdt.hdt.exceptions.NotImplementedException; + +import java.io.IOException; +import java.util.Iterator; + +public class OneReadDictionarySection implements TempDictionarySection { + private final Iterator<? extends CharSequence> reader; + private final long size; + + public OneReadDictionarySection(Iterator<? extends CharSequence> reader, long size) { + this.reader = reader; + this.size = size; + } + + @Override + public long add(CharSequence str) { + throw new NotImplementedException(); + } + + @Override + public void remove(CharSequence str) { + throw new NotImplementedException(); + } + + @Override + public void sort() { + throw new NotImplementedException(); + } + + @Override + public void clear() { + throw new NotImplementedException(); + } + + @Override + public boolean isSorted() { + return true; + } + + @Override + public Iterator<? extends CharSequence> getEntries() { + return reader; + } + + @Override + public long locate(CharSequence s) { + throw new NotImplementedException(); + } + + @Override + public CharSequence extract(long pos) { + throw new NotImplementedException(); + } + + @Override + public long size() { 
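+ // a one-read section cannot know its encoded byte size, only how many strings will pass through it, so the element count is reported for both size() and getNumberOfElements()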
+ return size; + } + + @Override + public long getNumberOfElements() { + return size; + } + + @Override + public Iterator<? extends CharSequence> getSortedEntries() { + return reader; + } + + @Override + public void close() throws IOException { + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java new file mode 100644 index 00000000..a4b197ef --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java @@ -0,0 +1,155 @@ +package org.rdfhdt.hdt.dictionary.impl.section; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.compact.sequence.SequenceLog64BigDisk; +import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.TempDictionarySection; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRC8; +import org.rdfhdt.hdt.util.crc.CRCOutputStream; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.CountOutputStream; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.ListenerUtil; +import org.rdfhdt.hdt.util.string.ByteStringUtil; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; + +/** + * Implementation of {@link org.rdfhdt.hdt.dictionary.DictionarySectionPrivate} that writes the loaded + * {@link org.rdfhdt.hdt.dictionary.TempDictionarySection} to disk before saving, reducing the size in RAM + * + * @author Antoine Willerval + */ +public class WriteDictionarySection implements DictionarySectionPrivate { + private final CloseSuppressPath tempFilename; + private final CloseSuppressPath blockTempFilename; + private SequenceLog64BigDisk blocks; + private final long blockSize; + private final int bufferSize; + private long numberElements = 0; + private long byteoutSize; + + public WriteDictionarySection(HDTOptions spec, Path filename, int bufferSize) { + this.bufferSize = bufferSize; + String fn = filename.getFileName().toString(); + tempFilename = CloseSuppressPath.of(filename.resolveSibling(fn + "_temp")); + blockTempFilename = CloseSuppressPath.of(filename.resolveSibling(fn + "_tempblock")); + long blockSize = spec.getInt("pfc.blocksize"); + if (blockSize < 0) { + throw new IllegalArgumentException("negative pfc.blocksize"); + } else if (blockSize == 0) { + this.blockSize = PFCDictionarySection.DEFAULT_BLOCK_SIZE; + } else { + this.blockSize = blockSize; + } + } + + @Override + public void load(TempDictionarySection other, ProgressListener plistener) { + MultiThreadListener listener = ListenerUtil.multiThreadListener(plistener); + long otherN = other.getNumberOfElements(); + long block = otherN < 10 ? 
1 : otherN / 10; + long currentCount = 0; + blocks = new SequenceLog64BigDisk(blockTempFilename.toAbsolutePath().toString(), 64, otherN / blockSize); + + listener.notifyProgress(0, "Filling section"); + try (CountOutputStream out = new CountOutputStream(tempFilename.openOutputStream(bufferSize))) { + CRCOutputStream crcout = new CRCOutputStream(out, new CRC32()); + String previousStr = null; + for (Iterator it = other.getSortedEntries(); it.hasNext(); currentCount++) { + CharSequence sec = it.next(); + String str = sec.toString(); + if (numberElements % blockSize == 0) { + blocks.append(out.getTotalBytes()); + + // Copy full string + ByteStringUtil.append(out, str, 0); + } else { + // Find common part. + int delta = ByteStringUtil.longestCommonPrefix(previousStr, str); + // Write Delta in VByte + VByte.encode(out, delta); + // Write remaining + ByteStringUtil.append(out, str, delta); + } + out.write(0); + previousStr = str; + numberElements++; + if (currentCount % block == 0) { + listener.notifyProgress((float) (currentCount * 100 / otherN), "Filling section"); + } + } + + byteoutSize = out.getTotalBytes(); + crcout.writeCRC(); + } catch (IOException e) { + throw new RuntimeException("can't load section", e); + } + blocks.append(byteoutSize); + // Trim text/blocks + blocks.aggressiveTrimToSize(); + if (numberElements % 100_000 == 0) { + listener.notifyProgress(100, "Completed section filling"); + } + } + + @Override + public void save(OutputStream output, ProgressListener listener) throws IOException { + CRCOutputStream out = new CRCOutputStream(output, new CRC8()); + out.write(PFCDictionarySection.TYPE_INDEX); + VByte.encode(out, numberElements); + + VByte.encode(out, byteoutSize); + VByte.encode(out, blockSize); + out.writeCRC(); + // Write blocks directly to output, they have their own CRC check. + blocks.save(output, listener); + // Write blocks data directly to output, the load was writing using a CRC check. 
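+ // the temp file already ends with the CRC32 appended at the end of load(), so a raw byte copy of it keeps the on-disk section layout intact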
+ Files.copy(tempFilename.getJavaPath(), output); + } + + @Override + public void load(InputStream input, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public long locate(CharSequence s) { + throw new NotImplementedException(); + } + + @Override + public CharSequence extract(long pos) { + throw new NotImplementedException(); + } + + @Override + public long size() { + return numberElements; + } + + @Override + public long getNumberOfElements() { + return numberElements; + } + + @Override + public Iterator getSortedEntries() { + throw new NotImplementedException(); + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(blocks, tempFilename, blockTempFilename); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index 7dd0be6c..30598b6b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -1,26 +1,53 @@ package org.rdfhdt.hdt.hdt; import org.rdfhdt.hdt.compact.bitmap.Bitmap; +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.dictionary.impl.CompressFourSectionDictionary; import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionary; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.enums.TripleComponentOrder; import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.hdt.impl.HDTBase; import org.rdfhdt.hdt.hdt.impl.HDTImpl; import org.rdfhdt.hdt.hdt.impl.TempHDTImporterOnePass; import org.rdfhdt.hdt.hdt.impl.TempHDTImporterTwoPass; +import org.rdfhdt.hdt.hdt.impl.WriteHDTImpl; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressTripleMapper; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; +import org.rdfhdt.hdt.hdt.impl.diskimport.SectionCompressor; +import org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult; import org.rdfhdt.hdt.hdt.writer.TripleWriterHDT; +import org.rdfhdt.hdt.header.HeaderPrivate; import org.rdfhdt.hdt.header.HeaderUtil; +import org.rdfhdt.hdt.iterator.utils.*; +import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.iterator.utils.FluxStopTripleStringIterator; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.rdf.RDFFluxStop; import org.rdfhdt.hdt.rdf.RDFParserCallback; import org.rdfhdt.hdt.rdf.RDFParserFactory; import org.rdfhdt.hdt.rdf.TripleWriter; +import org.rdfhdt.hdt.triples.TempTriples; import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.triples.TriplesPrivate; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.Profiler; +import org.rdfhdt.hdt.util.StringUtil; +import org.rdfhdt.hdt.util.concurrent.KWayMerger; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.MapCompressTripleMerger; +import org.rdfhdt.hdt.util.io.compress.TripleGenerator; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.listener.ListenerUtil; import org.rdfhdt.hdt.util.listener.PrefixListener; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; @@ -34,9 +61,10 @@ 
import java.util.Optional; public class HDTManagerImpl extends HDTManager { + private static final Logger logger = LoggerFactory.getLogger(HDTManagerImpl.class); private boolean useSimple(HDTOptions spec) { - String value = spec.get("parser.ntSimpleParser"); + String value = spec.get(HDTOptionsKeys.NT_SIMPLE_PARSER_KEY); return value != null && !value.isEmpty() && !value.equals("false"); } @@ -51,7 +79,7 @@ public HDT doLoadHDT(String hdtFileName, ProgressListener listener, HDTOptions s hdt.loadFromHDT(hdtFileName, listener); return hdt; } - + @Override protected HDT doMapHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException { HDTPrivate hdt = new HDTImpl(spec); @@ -74,7 +102,6 @@ public HDT doLoadIndexedHDT(String hdtFileName, ProgressListener listener, HDTOp hdt.loadOrCreateIndex(listener); return hdt; } - @Override @@ -95,21 +122,25 @@ public HDT doLoadIndexedHDT(InputStream hdtFile, ProgressListener listener, HDTO @Override public HDT doIndexedHDT(HDT hdt, ProgressListener listener) throws IOException { - ((HDTPrivate)hdt).loadOrCreateIndex(listener); + ((HDTPrivate) hdt).loadOrCreateIndex(listener); return hdt; } @Override public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions spec, ProgressListener listener) throws IOException, ParserException { // choose the importer - String loaderType = spec.get("loader.type"); + String loaderType = spec.get(HDTOptionsKeys.LOADER_TYPE_KEY); TempHDTImporter loader; - if ("two-pass".equals(loaderType)) { + if (HDTOptionsKeys.LOADER_TYPE_VALUE_TWO_PASS.equals(loaderType)) { loader = new TempHDTImporterTwoPass(useSimple(spec)); } else { + if (!HDTOptionsKeys.LOADER_TYPE_VALUE_ONE_PASS.equals(loaderType)) { + logger.warn("Used the option {} with value {}, which isn't recognized, using default value {}", + HDTOptionsKeys.LOADER_TYPE_KEY, loaderType, HDTOptionsKeys.LOADER_TYPE_VALUE_ONE_PASS); + } loader = new TempHDTImporterOnePass(useSimple(spec)); } - + // Create TempHDT try (TempHDT modHdt = loader.loadFromRDF(spec, rdfFileName, baseURI, rdfNotation, listener)) { @@ -130,6 +161,18 @@ public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNota } } + @Override + public HDT doGenerateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException { + // uncompress the stream if required + fileStream = IOUtil.asUncompressed(fileStream, compressionType); + // create a parser for this rdf stream + RDFParserCallback parser = RDFParserFactory.getParserCallback(rdfNotation); + // read the stream as triples + Iterator<TripleString> iterator = RDFParserFactory.readAsIterator(parser, fileStream, baseURI, true, rdfNotation); + + return doGenerateHDT(iterator, baseURI, hdtFormat, listener); + } + @Override public HDT doGenerateHDT(Iterator<TripleString> triples, String baseURI, HDTOptions spec, ProgressListener listener) throws IOException { // choose the importer @@ -155,7 +198,230 @@ public HDT doGenerateHDT(Iterator triples, String baseURI, HDTOpti } @Override - protected TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException { + public HDT doGenerateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + // read this file as a stream; do not uncompress it here, so the compressionType can differ from the file 
extension + try (InputStream stream = IOUtil.getFileInputStream(rdfFileName, false)) { + return doGenerateHDTDisk(stream, baseURI, rdfNotation, compressionType, hdtFormat, listener); + } + } + + @Override + public HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + // uncompress the stream if required + fileStream = IOUtil.asUncompressed(fileStream, compressionType); + // create a parser for this rdf stream + RDFParserCallback parser = RDFParserFactory.getParserCallback(rdfNotation, useSimple(hdtFormat)); + // read the stream as triples + Iterator<TripleString> iterator = RDFParserFactory.readAsIterator(parser, fileStream, baseURI, true, rdfNotation); + + return doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener); + } + + /** + * @return a theoretical maximum chunk size, computed from the memory still available to the JVM + */ + static long getMaxChunkSize(int workers) { + Runtime runtime = Runtime.getRuntime(); + return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * 0.85 / (1.5 * 3 * workers)); + } + + @Override + public HDT doGenerateHDTDisk(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener progressListener) throws IOException, ParserException { + MultiThreadListener listener = ListenerUtil.multiThreadListener(progressListener); + // load config + // compression mode + String compressMode = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY); // see CompressionResult + // workers for compression tasks + int workers = (int) hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY); + // maximum size of a chunk + long chunkSize = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY); + long maxFileOpenedLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_MAX_FILE_OPEN_KEY); + long kwayLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_KWAY_KEY); + long bufferSizeLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_BUFFER_SIZE_KEY); + int maxFileOpened; + int ways; + int bufferSize; + // location of the working directory, will be deleted after generation + String baseNameOpt = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY); + CloseSuppressPath basePath; + // location of the future HDT file; leave it unset to create the HDT in memory while merging + String futureHDTLocation = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY); + + Profiler profiler = new Profiler("doGenerateHDTDisk"); + String profilerString = hdtFormat.get(HDTOptionsKeys.PROFILER_KEY); + profiler.setDisabled(profilerString == null || !profilerString.equalsIgnoreCase("true")); + // check and set default values if required + if (workers == 0) { + workers = Runtime.getRuntime().availableProcessors(); + } else if (workers < 0) { + throw new IllegalArgumentException("Negative number of workers!"); + } + if (baseNameOpt == null || baseNameOpt.isEmpty()) { + basePath = CloseSuppressPath.of(Files.createTempDirectory("hdt-java-generate-disk")); + } else { + basePath = CloseSuppressPath.of(baseNameOpt); + } + basePath.closeWithDeleteRecurse(); + if (chunkSize == 0) { + chunkSize = getMaxChunkSize(workers); + } else if (chunkSize < 0) { + throw new IllegalArgumentException("Negative chunk size!"); + } + if (bufferSizeLong > Integer.MAX_VALUE - 5L || bufferSizeLong < 0) { + throw new IllegalArgumentException("Buffer size can't be negative or bigger than the size of an array!"); + } else if (bufferSizeLong == 0) 
{ + bufferSize = CloseSuppressPath.BUFFER_SIZE; + } else { + bufferSize = (int) bufferSizeLong; + } + if (maxFileOpenedLong < 0 || maxFileOpenedLong > Integer.MAX_VALUE) { + throw new IllegalArgumentException("maxFileOpened can't be negative!"); + } else if (maxFileOpenedLong == 0) { + maxFileOpened = 1024; + } else { + maxFileOpened = (int) maxFileOpenedLong; + } + if (kwayLong < 0 || kwayLong > Integer.MAX_VALUE) { + throw new IllegalArgumentException("kway can't be negative!"); + } else if (kwayLong == 0) { + ways = Math.max(1, BitUtil.log2(maxFileOpened / workers)); + } else { + ways = (int) kwayLong; + } + boolean mapHDT = futureHDTLocation != null && !futureHDTLocation.isEmpty(); + + // create working directory + basePath.mkdirs(); + try { + // compress the triples into sections and compressed triples + listener.notifyProgress(0, "Sorting sections with chunk of size: " + StringUtil.humanReadableByteCount(chunkSize, true) + "B with " + ways + "ways and " + workers + " worker(s)"); + + AsyncIteratorFetcher source = new AsyncIteratorFetcher<>(iterator); + + profiler.pushSection("section compression"); + CompressionResult compressionResult; + try { + compressionResult = new SectionCompressor( + basePath.resolve("sectionCompression"), + source, + listener, + bufferSize, + chunkSize, 1 << ways + ).compress(workers, compressMode); + } catch (KWayMerger.KWayMergerException | InterruptedException e) { + throw new ParserException(e); + } + profiler.popSection(); + + HDTBase hdt; + if (!mapHDT) { + // using default implementation + hdt = new HDTImpl(hdtFormat); + } else { + // using map implementation + hdt = new WriteHDTImpl(hdtFormat, basePath.resolve("maphdt"), bufferSize); + } + hdt.setBaseUri(baseURI); + + listener.unregisterAllThreads(); + listener.notifyProgress(20, "Create sections and triple mapping"); + + profiler.pushSection("dictionary write"); + // create sections and triple mapping + DictionaryPrivate dictionary = hdt.getDictionary(); + CompressTripleMapper mapper = new CompressTripleMapper(basePath, compressionResult.getTripleCount(), chunkSize); + CompressFourSectionDictionary modifiableDictionary = new CompressFourSectionDictionary(compressionResult, mapper, listener); + try { + dictionary.loadAsync(modifiableDictionary, listener); + } catch (InterruptedException e) { + throw new ParserException(e); + } + profiler.popSection(); + + // complete the mapper with the shared count and delete compression data + compressionResult.delete(); + mapper.setShared(dictionary.getNshared()); + + listener.notifyProgress(40, "Create mapped and sort triple file"); + // create mapped triples file + TripleCompressionResult tripleCompressionResult; + TriplesPrivate triples = hdt.getTriples(); + TripleComponentOrder order = triples.getOrder(); + profiler.pushSection("triple compression/map"); + try { + MapCompressTripleMerger tripleMapper = new MapCompressTripleMerger( + basePath.resolve("tripleMapper"), + new AsyncIteratorFetcher<>(new TripleGenerator(compressionResult.getTripleCount())), + mapper, + listener, + order, + bufferSize, + chunkSize, + 1 << ways); + tripleCompressionResult = tripleMapper.merge(workers, compressMode); + } catch (KWayMerger.KWayMergerException | InterruptedException e) { + throw new ParserException(e); + } + profiler.popSection(); + listener.unregisterAllThreads(); + + profiler.pushSection("bit triple creation"); + try { + // create bit triples and load the triples + TempTriples tempTriples = tripleCompressionResult.getTriples(); + IntermediateListener il = new 
IntermediateListener(listener); + il.setRange(80, 90); + il.setPrefix("Create bit triples: "); + il.notifyProgress(0, "create triples"); + triples.load(tempTriples, il); + tempTriples.close(); + + // completed the triples, delete the mapper + mapper.delete(); + } finally { + tripleCompressionResult.close(); + } + profiler.popSection(); + profiler.pushSection("header creation"); + + listener.notifyProgress(90, "Create HDT header"); + // header + hdt.populateHeaderStructure(hdt.getBaseURI()); + hdt.getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, compressionResult.getRawSize()); + + profiler.popSection(); + // return the HDT + if (mapHDT) { + profiler.pushSection("map to hdt"); + // write the HDT and map it + try { + hdt.saveToHDT(futureHDTLocation, listener); + } finally { + hdt.close(); + } + IntermediateListener il = new IntermediateListener(listener); + il.setPrefix("Map HDT: "); + il.setRange(95, 100); + il.notifyProgress(0, "start"); + try { + return doMapHDT(futureHDTLocation, il, hdtFormat); + } finally { + profiler.popSection(); + } + } else { + listener.notifyProgress(100, "HDT completed"); + return hdt; + } + } finally { + profiler.stop(); + profiler.writeProfiling(); + listener.notifyProgress(100, "Clearing disk"); + basePath.close(); + } + } + + @Override + protected TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) { return new TripleWriterHDT(baseURI, hdtFormat, out); } @@ -216,7 +482,7 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStre @Override protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { Path basePath; - String baseNameOpt = hdtFormat.get("loader.cattree.location"); + String baseNameOpt = hdtFormat.get(HDTOptionsKeys.LOADER_CATTREE_LOCATION_KEY); if (baseNameOpt == null || baseNameOpt.isEmpty()) { basePath = Files.createTempDirectory("hdt-java-cat-tree"); @@ -224,7 +490,11 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< basePath = Path.of(baseNameOpt); } - Path futureHDTLocation = Optional.ofNullable(hdtFormat.get("loader.cattree.futureHDTLocation")).map(Path::of).orElse(null); + Path futureHDTLocation = Optional.ofNullable(hdtFormat.get(HDTOptionsKeys.LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY)).map(Path::of).orElse(null); + + Profiler profiler = new Profiler("doHDTCatTree"); + String profilerString = hdtFormat.get(HDTOptionsKeys.PROFILER_KEY); + profiler.setDisabled(profilerString == null || !profilerString.equalsIgnoreCase("true")); FluxStopTripleStringIterator it = new FluxStopTripleStringIterator(iterator, fluxStop); @@ -244,17 +514,20 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< do { // generate the hdt gen++; + profiler.pushSection("generateHDT #" + gen); ProgressListener il = PrefixListener.of("gen#" + gen, listener); Path hdtLocation = hdtStore.resolve("hdt-" + gen + ".hdt"); supplier.doGenerateHDT(it, baseURI, hdtFormat, il, hdtLocation); nextFile = it.hasNextFlux(); HDTFile hdtFile = new HDTFile(hdtLocation, 1); + profiler.popSection(); // merge the generated hdt with each block with enough size while (!files.isEmpty() && (!nextFile || (files.get(files.size() - 1)).getChunks() <= hdtFile.getChunks())) { HDTFile lastHDTFile = files.remove(files.size() - 1); cat++; + profiler.pushSection("catHDT #" + cat); ProgressListener ilc = PrefixListener.of("cat#" + cat, 
listener); Path hdtCatFileLocation = hdtStore.resolve("hdtcat-" + cat + ".hdt"); try (HDT abcat = HDTManager.catHDT( @@ -269,12 +542,12 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< Files.delete(hdtFile.getHdtFile()); // note the new hdt file and the number of chunks hdtFile = new HDTFile(hdtCatFileLocation, lastHDTFile.getChunks() + hdtFile.getChunks()); + + profiler.popSection(); } files.add(hdtFile); } while (nextFile); - assert files.size() == 1; - Path hdtFile = files.get(0).hdtFile; assert files.get(0).getChunks() == gen; @@ -292,6 +565,8 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< return HDTManager.loadHDT(hdtFile.toAbsolutePath().toString()); } finally { Files.delete(hdtFile); + profiler.stop(); + profiler.writeProfiling(); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java new file mode 100644 index 00000000..fe1baeab --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java @@ -0,0 +1,166 @@ +package org.rdfhdt.hdt.hdt.impl; + +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.hdt.HDTPrivate; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.header.HeaderPrivate; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; +import org.rdfhdt.hdt.options.ControlInformation; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.triples.TriplesPrivate; +import org.rdfhdt.hdt.util.StringUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Date; + +/** + * Abstract HDT base for {@link org.rdfhdt.hdt.hdt.HDTPrivate} implementations + * + * @param <H> header type + * @param <D> dictionary type + * @param <T> triples type + */ +public abstract class HDTBase<H extends HeaderPrivate, D extends DictionaryPrivate, T extends TriplesPrivate> implements HDTPrivate { + protected final HDTOptions spec; + protected H header; + protected D dictionary; + protected T triples; + + protected HDTBase(HDTOptions spec) { + if (spec == null) { + this.spec = new HDTSpecification(); + } else { + this.spec = spec; + } + } + + /** + * Set the base URI of the HDT + * + * @param baseURI base uri + */ + public abstract void setBaseUri(String baseURI); + + /** + * @return if the HDT is closed + */ + public abstract boolean isClosed(); + + /* + * (non-Javadoc) + * + * @see hdt.HDT#getHeader() + */ + @Override + public H getHeader() { + return header; + } + + /* + * (non-Javadoc) + * + * @see hdt.HDT#getDictionary() + */ + @Override + public D getDictionary() { + return dictionary; + } + + /* + * (non-Javadoc) + * + * @see hdt.HDT#getTriples() + */ + @Override + public T getTriples() { + return triples; + } + + /* (non-Javadoc) + * @see hdt.hdt.HDT#getSize() + */ + @Override + public long size() { + if (isClosed()) + return 0; + + return dictionary.size() + triples.size(); + } + + /* + * (non-Javadoc) + * + * @see hdt.HDT#saveToHDT(java.io.OutputStream) + */ + @Override + public void saveToHDT(OutputStream output, ProgressListener listener) throws IOException { + ControlInfo ci = new ControlInformation(); + IntermediateListener iListener = new IntermediateListener(listener); + + ci.clear(); + ci.setType(ControlInfo.Type.GLOBAL); + ci.setFormat(HDTVocabulary.HDT_CONTAINER); + ci.save(output); + + ci.clear(); + ci.setType(ControlInfo.Type.HEADER); + header.save(output, ci, iListener); + + ci.clear(); 
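+ // same ControlInfo-then-payload pattern for the two remaining parts: dictionary first, then triples, reusing the same ci instance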
ci.setType(ControlInfo.Type.DICTIONARY); + dictionary.save(output, ci, iListener); + + ci.clear(); + ci.setType(ControlInfo.Type.TRIPLES); + triples.save(output, ci, iListener); + } + + @Override + public void populateHeaderStructure(String baseUri) { + if (baseUri == null || baseUri.length() == 0) { + throw new IllegalArgumentException("baseURI cannot be empty"); + } + + if (isClosed()) { + throw new IllegalStateException("Cannot add header to a closed HDT."); + } + + H header = getHeader(); + D dictionary = getDictionary(); + T triples = getTriples(); + header.insert(baseUri, HDTVocabulary.RDF_TYPE, HDTVocabulary.HDT_DATASET); + header.insert(baseUri, HDTVocabulary.RDF_TYPE, HDTVocabulary.VOID_DATASET); + + // VOID + header.insert(baseUri, HDTVocabulary.VOID_TRIPLES, triples.getNumberOfElements()); + header.insert(baseUri, HDTVocabulary.VOID_PROPERTIES, dictionary.getNpredicates()); + header.insert(baseUri, HDTVocabulary.VOID_DISTINCT_SUBJECTS, dictionary.getNsubjects()); + header.insert(baseUri, HDTVocabulary.VOID_DISTINCT_OBJECTS, dictionary.getNobjects()); + + // Structure + String formatNode = "_:format"; + String dictNode = "_:dictionary"; + String triplesNode = "_:triples"; + String statisticsNode = "_:statistics"; + String publicationInfoNode = "_:publicationInformation"; + + header.insert(baseUri, HDTVocabulary.HDT_FORMAT_INFORMATION, formatNode); + header.insert(formatNode, HDTVocabulary.HDT_DICTIONARY, dictNode); + header.insert(formatNode, HDTVocabulary.HDT_TRIPLES, triplesNode); + header.insert(baseUri, HDTVocabulary.HDT_STATISTICAL_INFORMATION, statisticsNode); + header.insert(baseUri, HDTVocabulary.HDT_PUBLICATION_INFORMATION, publicationInfoNode); + + dictionary.populateHeader(header, dictNode); + triples.populateHeader(header, triplesNode); + + header.insert(statisticsNode, HDTVocabulary.HDT_SIZE, getDictionary().size() + getTriples().size()); + + // Current time + header.insert(publicationInfoNode, HDTVocabulary.DUBLIN_CORE_ISSUED, StringUtil.formatDate(new Date())); + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java index f092cbbf..458ffdbd 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java @@ -31,7 +31,6 @@ import org.rdfhdt.hdt.compact.bitmap.Bitmap; import org.rdfhdt.hdt.compact.bitmap.BitmapFactory; import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; -import org.rdfhdt.hdt.dictionary.Dictionary; import org.rdfhdt.hdt.dictionary.DictionaryCat; import org.rdfhdt.hdt.dictionary.DictionaryDiff; import org.rdfhdt.hdt.dictionary.DictionaryFactory; @@ -50,11 +49,9 @@ import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.hdt.HDT; -import org.rdfhdt.hdt.hdt.HDTPrivate; import org.rdfhdt.hdt.hdt.HDTVersion; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.hdt.TempHDT; -import org.rdfhdt.hdt.header.Header; import org.rdfhdt.hdt.header.HeaderFactory; import org.rdfhdt.hdt.header.HeaderPrivate; import org.rdfhdt.hdt.iterator.DictionaryTranslateIterator; @@ -70,7 +67,6 @@ import org.rdfhdt.hdt.triples.TempTriples; import org.rdfhdt.hdt.triples.TripleID; import org.rdfhdt.hdt.triples.TripleString; -import org.rdfhdt.hdt.triples.Triples; import org.rdfhdt.hdt.triples.TriplesFactory; import org.rdfhdt.hdt.triples.TriplesPrivate; import org.rdfhdt.hdt.triples.impl.BitmapTriples; @@ -79,7 +75,6 
@@ import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorDiff; import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorMapDiff; import org.rdfhdt.hdt.util.StopWatch; -import org.rdfhdt.hdt.util.StringUtil; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -97,7 +92,6 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Paths; -import java.util.Date; import java.util.Iterator; import java.util.Map; import java.util.zip.GZIPInputStream; @@ -106,75 +100,20 @@ * Basic implementation of HDT interface * */ -public class HDTImpl implements HDTPrivate { +public class HDTImpl extends HDTBase { private static final Logger log = LoggerFactory.getLogger(HDTImpl.class); - private final HDTOptions spec; - - protected HeaderPrivate header; - protected DictionaryPrivate dictionary; - protected TriplesPrivate triples; - private String hdtFileName; private String baseUri; private boolean isMapped; private boolean isClosed=false; - private void createComponents() { - header = HeaderFactory.createHeader(spec); - dictionary = DictionaryFactory.createDictionary(spec); - triples = TriplesFactory.createTriples(spec); - } - - @Override - public void populateHeaderStructure(String baseUri) { - if(baseUri==null || baseUri.length()==0) { - throw new IllegalArgumentException("baseURI cannot be empty"); - } - - if(isClosed) { - throw new IllegalStateException("Cannot add header to a closed HDT."); - } - - header.insert(baseUri, HDTVocabulary.RDF_TYPE, HDTVocabulary.HDT_DATASET); - header.insert(baseUri, HDTVocabulary.RDF_TYPE, HDTVocabulary.VOID_DATASET); - - // VOID - header.insert(baseUri, HDTVocabulary.VOID_TRIPLES, triples.getNumberOfElements()); - header.insert(baseUri, HDTVocabulary.VOID_PROPERTIES, dictionary.getNpredicates()); - header.insert(baseUri, HDTVocabulary.VOID_DISTINCT_SUBJECTS, dictionary.getNsubjects()); - header.insert(baseUri, HDTVocabulary.VOID_DISTINCT_OBJECTS, dictionary.getNobjects()); - - // Structure - String formatNode = "_:format"; - String dictNode = "_:dictionary"; - String triplesNode = "_:triples"; - String statisticsNode = "_:statistics"; - String publicationInfoNode = "_:publicationInformation"; - - header.insert(baseUri, HDTVocabulary.HDT_FORMAT_INFORMATION, formatNode); - header.insert(formatNode, HDTVocabulary.HDT_DICTIONARY, dictNode); - header.insert(formatNode, HDTVocabulary.HDT_TRIPLES, triplesNode); - header.insert(baseUri, HDTVocabulary.HDT_STATISTICAL_INFORMATION, statisticsNode); - header.insert(baseUri, HDTVocabulary.HDT_PUBLICATION_INFORMATION, publicationInfoNode); - - dictionary.populateHeader(header, dictNode); - triples.populateHeader(header, triplesNode); - - header.insert(statisticsNode, HDTVocabulary.HDT_SIZE, getDictionary().size()+getTriples().size()); - - // Current time - header.insert(publicationInfoNode, HDTVocabulary.DUBLIN_CORE_ISSUED, StringUtil.formatDate(new Date())); - } - public HDTImpl(HDTOptions spec) { - if (spec == null) { - this.spec = new HDTSpecification(); - } else { - this.spec = spec; - } + super(spec); - createComponents(); + header = HeaderFactory.createHeader(this.spec); + dictionary = DictionaryFactory.createDictionary(this.spec); + triples = TriplesFactory.createTriples(this.spec); } @Override @@ -198,14 +137,7 @@ public void loadFromHDT(InputStream input, ProgressListener listener) throws IOE header.load(input, ci, iListener); // Set base URI. 
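+ // the base URI is, per the header convention used above, the subject of the ( ? , rdf:type, hdt:Dataset ) triple of the header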
- try { - IteratorTripleString it = header.search("", HDTVocabulary.RDF_TYPE, HDTVocabulary.HDT_DATASET); - if(it.hasNext()) { - this.baseUri = it.next().getSubject().toString(); - } - } catch (NotFoundException e) { - log.error("Unexpected exception.", e); - } + this.baseUri = header.getBaseURI().toString(); // Load dictionary ci.clear(); @@ -310,34 +242,6 @@ public void mapFromHDT(File f, long offset, ProgressListener listener) throws IO isClosed=false; } - /* - * (non-Javadoc) - * - * @see hdt.HDT#saveToHDT(java.io.OutputStream) - */ - @Override - public void saveToHDT(OutputStream output, ProgressListener listener) throws IOException { - ControlInfo ci = new ControlInformation(); - IntermediateListener iListener = new IntermediateListener(listener); - - ci.clear(); - ci.setType(ControlInfo.Type.GLOBAL); - ci.setFormat(HDTVocabulary.HDT_CONTAINER); - ci.save(output); - - ci.clear(); - ci.setType(ControlInfo.Type.HEADER); - header.save(output, ci, iListener); - - ci.clear(); - ci.setType(ControlInfo.Type.DICTIONARY); - dictionary.save(output, ci, iListener); - - ci.clear(); - ci.setType(ControlInfo.Type.TRIPLES); - triples.save(output, ci, iListener); - } - /* * (non-Javadoc) * @@ -415,52 +319,16 @@ public long getLastTriplePosition() { } } - /* - * (non-Javadoc) - * - * @see hdt.HDT#getHeader() - */ - @Override - public Header getHeader() { - return header; - } - - /* - * (non-Javadoc) - * - * @see hdt.HDT#getDictionary() - */ - @Override - public Dictionary getDictionary() { - return dictionary; - } - - /* - * (non-Javadoc) - * - * @see hdt.HDT#getTriples() - */ - @Override - public Triples getTriples() { - return triples; - } - - /* (non-Javadoc) - * @see hdt.hdt.HDT#getSize() - */ - @Override - public long size() { - if(isClosed) - return 0; - - return dictionary.size()+triples.size(); - } - public void loadFromParts(HeaderPrivate h, DictionaryPrivate d, TriplesPrivate t) { this.header = h; this.dictionary = d; this.triples = t; - isClosed=false; + isClosed=false; + } + + @Override + public void setBaseUri(String baseUri) { + this.baseUri = baseUri; } public void loadFromModifiableHDT(TempHDT modHdt, ProgressListener listener) { @@ -469,8 +337,8 @@ public void loadFromModifiableHDT(TempHDT modHdt, ProgressListener listener) { modHdt.reorganizeTriples(listener); // Get parts - TempTriples modifiableTriples = (TempTriples) modHdt.getTriples(); - TempDictionary modifiableDictionary = (TempDictionary) modHdt.getDictionary(); + TempTriples modifiableTriples = modHdt.getTriples(); + TempDictionary modifiableDictionary = modHdt.getDictionary(); // Convert triples to final format if(triples.getClass().equals(modifiableTriples.getClass())) { @@ -591,6 +459,7 @@ public String getHDTFileName() { return hdtFileName; } + @Override public boolean isClosed() { return isClosed; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java new file mode 100644 index 00000000..a1f13170 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java @@ -0,0 +1,111 @@ +package org.rdfhdt.hdt.hdt.impl; + +import org.rdfhdt.hdt.dictionary.impl.WriteFourSectionDictionary; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.header.HeaderFactory; +import org.rdfhdt.hdt.header.HeaderPrivate; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.triples.IteratorTripleString; +import 
org.rdfhdt.hdt.triples.impl.WriteBitmapTriples; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * HDT implementation to write on disk the components + * + * @author Antoine Willerval + */ +public class WriteHDTImpl extends HDTBase { + private String baseURI; + private final CloseSuppressPath workingLocation; + private boolean isClosed; + + public WriteHDTImpl(HDTOptions spec, CloseSuppressPath workingLocation, int bufferSize) throws IOException { + super(spec); + this.workingLocation = workingLocation; + workingLocation.mkdirs(); + + dictionary = new WriteFourSectionDictionary(this.spec, workingLocation.resolve("section"), bufferSize); + // we need to have the bitmaps in memory, so we can't bypass the implementation + triples = new WriteBitmapTriples(this.spec, workingLocation.resolve("tripleBitmap"), bufferSize); + // small, can use default implementation + header = HeaderFactory.createHeader(this.spec); + } + + @Override + public void setBaseUri(String baseURI) { + this.baseURI = baseURI; + } + + @Override + public void loadFromHDT(InputStream input, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void loadFromHDT(String fileName, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void mapFromHDT(File f, long offset, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void loadOrCreateIndex(ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void saveToHDT(String fileName, ProgressListener listener) throws IOException { + try (OutputStream out = new BufferedOutputStream(Files.newOutputStream(Path.of(fileName)))) { + saveToHDT(out, listener); + } + } + + @Override + public long size() { + if (isClosed) + return 0; + + return getDictionary().size() + getTriples().size(); + } + + @Override + public String getBaseURI() { + return baseURI; + } + + @Override + public boolean isClosed() { + return isClosed; + } + + @Override + public void close() throws IOException { + if (isClosed()) { + return; + } + isClosed = true; + IOUtil.closeAll( + dictionary, + triples, + workingLocation + ); + } + + @Override + public IteratorTripleString search(CharSequence subject, CharSequence predicate, CharSequence object) { + throw new NotImplementedException(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java new file mode 100644 index 00000000..d8c4506d --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java @@ -0,0 +1,134 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.compact.sequence.SequenceLog64BigDisk; +import org.rdfhdt.hdt.dictionary.impl.CompressFourSectionDictionary; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.disk.LongArray; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressUtil; +import org.rdfhdt.hdt.util.io.compress.WriteLongArrayBuffer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; + +/** + * Map 
a compressed triple file to long array map files + * + * @author Antoine Willerval + */ +public class CompressTripleMapper implements CompressFourSectionDictionary.NodeConsumer { + private static final Logger log = LoggerFactory.getLogger(CompressTripleMapper.class); + private final WriteLongArrayBuffer subjects; + private final WriteLongArrayBuffer predicates; + private final WriteLongArrayBuffer objects; + private final CloseSuppressPath locationSubjects; + private final CloseSuppressPath locationPredicates; + private final CloseSuppressPath locationObjects; + private long shared = -1; + + public CompressTripleMapper(CloseSuppressPath location, long tripleCount, long chunkSize) { + locationSubjects = location.resolve("map_subjects"); + locationPredicates = location.resolve("map_predicates"); + locationObjects = location.resolve("map_objects"); + int numbits = BitUtil.log2(tripleCount + 2) + CompressUtil.INDEX_SHIFT; + int maxElement = (int) Math.min(chunkSize / Long.BYTES / 3, Integer.MAX_VALUE - 5); + subjects = + new WriteLongArrayBuffer( + new SequenceLog64BigDisk(locationSubjects.toAbsolutePath().toString(), numbits, tripleCount + 2, true), + tripleCount, maxElement); + predicates = + new WriteLongArrayBuffer(new SequenceLog64BigDisk(locationPredicates.toAbsolutePath().toString(), numbits, tripleCount + 2, true), + tripleCount, maxElement); + objects = + new WriteLongArrayBuffer(new SequenceLog64BigDisk(locationObjects.toAbsolutePath().toString(), numbits, tripleCount + 2, true), + tripleCount, maxElement); + } + + /** + * delete the map files and the location files + */ + public void delete() { + try { + IOUtil.closeAll(subjects, predicates, objects); + } catch (IOException e) { + log.warn("Can't close triple map array", e); + } + try { + IOUtil.closeAll(locationSubjects, locationPredicates, locationObjects); + } catch (IOException e) { + log.warn("Can't delete triple map array files", e); + } + } + + @Override + public void onSubject(long preMapId, long newMapId) { + assert preMapId > 0; + assert newMapId >= CompressUtil.getHeaderId(1); + subjects.set(preMapId, newMapId); + } + + @Override + public void onPredicate(long preMapId, long newMapId) { + assert preMapId > 0; + assert newMapId >= CompressUtil.getHeaderId(1); + predicates.set(preMapId, newMapId); + } + + @Override + public void onObject(long preMapId, long newMapId) { + assert preMapId > 0; + assert newMapId >= CompressUtil.getHeaderId(1); + objects.set(preMapId, newMapId); + } + + public void setShared(long shared) { + this.shared = shared; + subjects.free(); + predicates.free(); + objects.free(); + } + + private void checkShared() { + if (this.shared < 0) { + throw new IllegalArgumentException("Shared not set!"); + } + } + + /** + * extract the map id of a subject + * + * @param id id + * @return new id + */ + public long extractSubject(long id) { + return extract(subjects, id); + } + + /** + * extract the map id of a predicate + * + * @param id id + * @return new id + */ + public long extractPredicate(long id) { + return extract(predicates, id) - shared; + } + + /** + * extract the map id of an object + * + * @param id id + * @return new id + */ + public long extractObjects(long id) { + return extract(objects, id); + } + + private long extract(LongArray array, long id) { + checkShared(); + // compute shared if required + return CompressUtil.computeSharedNode(array.get(id), shared); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java new file mode 100644 index 00000000..e8887b40 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java @@ -0,0 +1,68 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.options.HDTOptionsKeys; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; + +import java.io.Closeable; +import java.io.IOException; + +/** + * Result for the {@link org.rdfhdt.hdt.hdt.impl.diskimport.SectionCompressor} + * @author Antoine Willerval + */ +public interface CompressionResult extends Closeable { + /** + * partial mode for config + * @see org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResultPartial + */ + String COMPRESSION_MODE_PARTIAL = HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL; + /** + * complete mode for config + * @see org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResultFile + */ + String COMPRESSION_MODE_COMPLETE = HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE; + + /** + * @return the number of triples + */ + long getTripleCount(); + /** + * @return a sorted iterator of subjects + */ + ExceptionIterator<IndexedNode, IOException> getSubjects(); + /** + * @return a sorted iterator of predicates + */ + ExceptionIterator<IndexedNode, IOException> getPredicates(); + /** + * @return a sorted iterator of objects + */ + ExceptionIterator<IndexedNode, IOException> getObjects(); + /** + * @return the count of subjects + */ + long getSubjectsCount(); + /** + * @return the count of predicates + */ + long getPredicatesCount(); + /** + * @return the count of objects + */ + long getObjectsCount(); + /** + * @return the count of shared subject-objects + */ + long getSharedCount(); + + /** + * @return the size of the origin file + */ + long getRawSize(); + /** + * delete data associated with this result + */ + void delete() throws IOException; +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultEmpty.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultEmpty.java new file mode 100644 index 00000000..55d0e385 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultEmpty.java @@ -0,0 +1,61 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; + +import java.io.IOException; + +public class CompressionResultEmpty implements CompressionResult { + @Override + public long getTripleCount() { + return 0; + } + + @Override + public ExceptionIterator<IndexedNode, IOException> getSubjects() { + return ExceptionIterator.empty(); + } + + @Override + public ExceptionIterator<IndexedNode, IOException> getPredicates() { + return ExceptionIterator.empty(); + } + + @Override + public ExceptionIterator<IndexedNode, IOException> getObjects() { + return ExceptionIterator.empty(); + } + + @Override + public long getSubjectsCount() { + return 0; + } + + @Override + public long getPredicatesCount() { + return 0; + } + + @Override + public long getObjectsCount() { + return 0; + } + + @Override + public long getSharedCount() { + return 0; + } + + @Override + public void delete() throws IOException { + } + + @Override + public long getRawSize() { + return 0; + } + + @Override + public void close() throws IOException { + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultFile.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultFile.java new file
mode 100644 index 00000000..99f240c2 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultFile.java @@ -0,0 +1,89 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressNodeReader; + +import java.io.IOException; + +/** + * Implementation of {@link org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult} for full file reading + * @author Antoine Willerval + */ +public class CompressionResultFile implements CompressionResult { + private final long tripleCount; + private final long ntRawSize; + private final CompressNodeReader subjects; + private final CompressNodeReader predicates; + private final CompressNodeReader objects; + private final SectionCompressor.TripleFile sections; + + public CompressionResultFile(long tripleCount, long ntRawSize, SectionCompressor.TripleFile sections) throws IOException { + this.tripleCount = tripleCount; + this.ntRawSize = ntRawSize; + this.subjects = new CompressNodeReader(sections.openRSubject()); + this.predicates = new CompressNodeReader(sections.openRPredicate()); + this.objects = new CompressNodeReader(sections.openRObject()); + this.sections = sections; + } + + @Override + public long getTripleCount() { + return tripleCount; + } + + @Override + public ExceptionIterator getSubjects() { + return subjects; + } + + @Override + public ExceptionIterator getPredicates() { + return predicates; + } + + @Override + public ExceptionIterator getObjects() { + return objects; + } + + @Override + public void delete() throws IOException { + sections.delete(); + } + + @Override + public long getSubjectsCount() { + return subjects.getSize(); + } + + @Override + public long getPredicatesCount() { + return predicates.getSize(); + } + + @Override + public long getObjectsCount() { + return objects.getSize(); + } + + @Override + public long getSharedCount() { + return tripleCount; + } + + @Override + public long getRawSize() { + return ntRawSize; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll( + objects, + predicates, + subjects + ); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultPartial.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultPartial.java new file mode 100644 index 00000000..6a26b6c4 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultPartial.java @@ -0,0 +1,142 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressNodeMergeIterator; +import org.rdfhdt.hdt.util.io.compress.CompressNodeReader; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Function; + +/** + * Implementation of {@link org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult} for partial file reading + * + * @author Antoine Willerval + */ +public class CompressionResultPartial implements CompressionResult { + private final List files; + private final long triplesCount; + private final long ntSize; + private final ExceptionIterator subject; + private final ExceptionIterator predicate; + private final ExceptionIterator object; + + public 
CompressionResultPartial(List files, long triplesCount, long ntSize) throws IOException { + this.files = new ArrayList<>(files.size()); + this.ntSize = ntSize; + for (SectionCompressor.TripleFile file : files) { + this.files.add(new CompressNodeReaderTriple(file)); + } + this.triplesCount = triplesCount; + + // building iterator trees + this.subject = createBTree(0, files.size(), CompressNodeReaderTriple::getS); + this.predicate = createBTree(0, files.size(), CompressNodeReaderTriple::getP); + this.object = createBTree(0, files.size(), CompressNodeReaderTriple::getO); + } + + private ExceptionIterator createBTree(int start, int end, Function fetcher) { + int size = end - start; + if (size <= 0) { + return ExceptionIterator.empty(); + } + if (size == 1) { + return fetcher.apply(files.get(start)); + } + int mid = (start + end) / 2; + ExceptionIterator left = createBTree(start, mid, fetcher); + ExceptionIterator right = createBTree(mid, end, fetcher); + return new CompressNodeMergeIterator(left, right); + } + + @Override + public long getTripleCount() { + return triplesCount; + } + + @Override + public ExceptionIterator getSubjects() { + return subject; + } + + @Override + public ExceptionIterator getPredicates() { + return predicate; + } + + @Override + public ExceptionIterator getObjects() { + return object; + } + + @Override + public void delete() throws IOException { + IOUtil.closeAll(files); + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(files); + } + + /* + * use the count of triples because we don't know the number of subjects + */ + @Override + public long getSubjectsCount() { + return triplesCount; + } + + @Override + public long getPredicatesCount() { + return triplesCount; + } + + @Override + public long getObjectsCount() { + return triplesCount; + } + + @Override + public long getSharedCount() { + return triplesCount; + } + + @Override + public long getRawSize() { + return ntSize; + } + + private static class CompressNodeReaderTriple implements Closeable { + final CompressNodeReader s, p, o; + final SectionCompressor.TripleFile file; + + public CompressNodeReaderTriple(SectionCompressor.TripleFile file) throws IOException { + this.s = new CompressNodeReader(file.openRSubject()); + this.p = new CompressNodeReader(file.openRPredicate()); + this.o = new CompressNodeReader(file.openRObject()); + this.file = file; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(s, p, o); + } + + public CompressNodeReader getS() { + return s; + } + + public CompressNodeReader getP() { + return p; + } + + public CompressNodeReader getO() { + return o; + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java new file mode 100644 index 00000000..51b511d9 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java @@ -0,0 +1,459 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; +import org.rdfhdt.hdt.iterator.utils.SizeFetcher; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.ParallelSortableArrayList; +import org.rdfhdt.hdt.util.concurrent.ExceptionFunction; +import org.rdfhdt.hdt.util.concurrent.ExceptionSupplier; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import 
org.rdfhdt.hdt.util.concurrent.KWayMerger; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressNodeMergeIterator; +import org.rdfhdt.hdt.util.io.compress.CompressNodeReader; +import org.rdfhdt.hdt.util.io.compress.CompressUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Function; +import java.util.function.Supplier; + +/** + * Tree worker object to compress the sections of a triple stream into 3 sections (SPO) and a compressed triple file + * + * @author Antoine Willerval + */ +public class SectionCompressor implements KWayMerger.KWayMergerImpl<TripleString, SizeFetcher<TripleString>> { + private static final Logger log = LoggerFactory.getLogger(SectionCompressor.class); + + private final CloseSuppressPath baseFileName; + private final AsyncIteratorFetcher<TripleString> source; + private boolean done; + private final MultiThreadListener listener; + private final AtomicLong triples = new AtomicLong(); + private final AtomicLong ntRawSize = new AtomicLong(); + private final int bufferSize; + private final long chunkSize; + private final int k; + + public SectionCompressor(CloseSuppressPath baseFileName, AsyncIteratorFetcher<TripleString> source, MultiThreadListener listener, int bufferSize, long chunkSize, int k) { + this.source = source; + this.listener = listener; + this.baseFileName = baseFileName; + this.bufferSize = bufferSize; + this.chunkSize = chunkSize; + this.k = k; + } + + /* + * FIXME: create a factory and override these methods with the hdt spec + */ + + /** + * mapping method for the subject of the triple, this method should copy the sequence! + * + * @param seq the subject (before) + * @return the subject mapped + */ + protected CharSequence convertSubject(CharSequence seq) { + return seq.toString(); + } + + /** + * mapping method for the predicate of the triple, this method should copy the sequence! + * + * @param seq the predicate (before) + * @return the predicate mapped + */ + protected CharSequence convertPredicate(CharSequence seq) { + return seq.toString(); + } + + /** + * mapping method for the object of the triple, this method should copy the sequence!
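+ * (the default implementations copy through toString(); the parser backing the stream may reuse its CharSequence buffers between triples, so a custom override must copy as well)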
+ * + * @param seq the object (before) + * @return the object mapped + */ + protected CharSequence convertObject(CharSequence seq) { + return seq.toString(); + } + + /** + * Compress the stream into complete pre-section files + * + * @param workers the number of workers + * @return compression result + * @throws IOException io exception + * @throws InterruptedException if the thread is interrupted + * @throws KWayMerger.KWayMergerException exception with the tree working + * @see #compressPartial() + * @see #compress(int, String) + */ + public CompressionResult compressToFile(int workers) throws IOException, InterruptedException, KWayMerger.KWayMergerException { + // force to create the first file + KWayMerger<TripleString, SizeFetcher<TripleString>> merger = new KWayMerger<>(baseFileName, source, this, Math.max(1, workers - 1), k); + merger.start(); + // wait for the workers to merge the sections and create the triples + Optional<CloseSuppressPath> sections = merger.waitResult(); + if (sections.isEmpty()) { + return new CompressionResultEmpty(); + } + return new CompressionResultFile(triples.get(), ntRawSize.get(), new TripleFile(sections.get(), false)); + } + + /** + * Compress the stream into multiple pre-section files and merge them on the fly + * + * @return compression result + * @throws IOException io exception + * @see #compressToFile(int) + * @see #compress(int, String) + */ + public CompressionResult compressPartial() throws IOException, KWayMerger.KWayMergerException { + List<TripleFile> files = new ArrayList<>(); + baseFileName.closeWithDeleteRecurse(); + try { + baseFileName.mkdirs(); + long fileName = 0; + while (!source.isEnd()) { + TripleFile file = new TripleFile(baseFileName.resolve("chunk#"+fileName++), true); + createChunk(newStopFlux(source), file.root); + files.add(file); + } + } catch (Throwable e) { + try { + throw e; + } finally { + try { + IOUtil.closeAll(files); + } finally { + baseFileName.close(); + } + } + } + return new CompressionResultPartial(files, triples.get(), ntRawSize.get()); + } + + /** + * compress the sections/triples with a particular mode + * + * @param workers the number of workers required + * @param mode the mode to compress, can be {@link org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult#COMPRESSION_MODE_COMPLETE} (default), {@link org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult#COMPRESSION_MODE_PARTIAL} or null/"" for default + * @return the compression result + * @throws KWayMerger.KWayMergerException tree working exception + * @throws IOException io exception + * @throws InterruptedException thread interruption + * @see #compressToFile(int) + * @see #compressPartial() + */ + public CompressionResult compress(int workers, String mode) throws KWayMerger.KWayMergerException, IOException, InterruptedException { + if (mode == null) { + mode = ""; + } + switch (mode) { + case "": + case CompressionResult.COMPRESSION_MODE_COMPLETE: + return compressToFile(workers); + case CompressionResult.COMPRESSION_MODE_PARTIAL: + return compressPartial(); + default: + throw new IllegalArgumentException("Unknown compression mode: " + mode); + } + } + + @Override + public void createChunk(SizeFetcher<TripleString> fetcher, CloseSuppressPath output) throws KWayMerger.KWayMergerException { + + listener.notifyProgress(0, "start reading triples"); + + ParallelSortableArrayList<IndexedNode> subjects = new ParallelSortableArrayList<>(IndexedNode[].class); + ParallelSortableArrayList<IndexedNode> predicates = new ParallelSortableArrayList<>(IndexedNode[].class); + ParallelSortableArrayList<IndexedNode> objects = new ParallelSortableArrayList<>(IndexedNode[].class); + +
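+ // createChunk drains the SizeFetcher until the chunk budget is reached; the shared + // AtomicLong hands out triple ids that stay unique across chunks, so the sorted + // subject/predicate/object entries can later be joined back by triple id +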
listener.notifyProgress(10, "reading triples " + triples.get()); + TripleString next; + while ((next = fetcher.get()) != null) { + + // load the map triple and write it in the writer + long tripleID = triples.incrementAndGet(); + + // get indexed mapped char sequence + IndexedNode subjectNode = new IndexedNode( + convertSubject(next.getSubject()), + tripleID + ); + subjects.add(subjectNode); + + // get indexed mapped char sequence + IndexedNode predicateNode = new IndexedNode( + convertPredicate(next.getPredicate()), + tripleID + ); + predicates.add(predicateNode); + + // get indexed mapped char sequence + IndexedNode objectNode = new IndexedNode( + convertObject(next.getObject()), + tripleID + ); + objects.add(objectNode); + + + if (tripleID % 100_000 == 0) { + listener.notifyProgress(10, "reading triples " + tripleID); + } + // too much ram allowed? + if (subjects.size() == Integer.MAX_VALUE - 6) { + break; + } + } + + ntRawSize.addAndGet(fetcher.getSize()); + + try { + TripleFile sections = new TripleFile(output, true); + try { + IntermediateListener il = new IntermediateListener(listener); + il.setRange(70, 80); + il.setPrefix("creating subjects section " + sections.root.getFileName() + ": "); + il.notifyProgress(0, "sorting"); + try (OutputStream stream = sections.openWSubject()) { + subjects.parallelSort(IndexedNode::compareTo); + CompressUtil.writeCompressedSection(subjects, stream, il); + } + il.setRange(80, 90); + il.setPrefix("creating predicates section " + sections.root.getFileName() + ": "); + il.notifyProgress(0, "sorting"); + try (OutputStream stream = sections.openWPredicate()) { + predicates.parallelSort(IndexedNode::compareTo); + CompressUtil.writeCompressedSection(predicates, stream, il); + } + il.setRange(90, 100); + il.setPrefix("creating objects section " + sections.root.getFileName() + ": "); + il.notifyProgress(0, "sorting"); + try (OutputStream stream = sections.openWObject()) { + objects.parallelSort(IndexedNode::compareTo); + CompressUtil.writeCompressedSection(objects, stream, il); + } + } finally { + subjects.clear(); + predicates.clear(); + objects.clear(); + listener.notifyProgress(100, "section completed " + sections.root.getFileName().toString()); + } + } catch (IOException e) { + throw new KWayMerger.KWayMergerException(e); + } + } + + @Override + public void mergeChunks(List<CloseSuppressPath> inputs, CloseSuppressPath output) throws KWayMerger.KWayMergerException { + TripleFile sections; + try { + sections = new TripleFile(output, true); + List<TripleFile> tripleFiles = new ArrayList<>(); + for (CloseSuppressPath in : inputs) { + tripleFiles.add(new TripleFile(in, false)); + } + sections.compute(tripleFiles, false); + listener.notifyProgress(100, "sections merged " + sections.root.getFileName()); + // delete old sections + IOUtil.closeAll(inputs); + } catch (IOException | InterruptedException e) { + throw new KWayMerger.KWayMergerException(e); + } + } + + @Override + public SizeFetcher<TripleString> newStopFlux(Supplier<TripleString> flux) { + return SizeFetcher.ofTripleString(flux, chunkSize); + } + + /** + * A triple directory containing 3 files: subject, predicate and object + * + * @author Antoine Willerval + */ + public class TripleFile implements Closeable { + private final CloseSuppressPath root; + private final CloseSuppressPath s; + private final CloseSuppressPath p; + private final CloseSuppressPath o; + + private TripleFile(CloseSuppressPath root, boolean mkdir) throws IOException { + this.root = root; + this.s = root.resolve("subject"); + this.p = root.resolve("predicate"); + this.o =
root.resolve("object"); + + root.closeWithDeleteRecurse(); + if (mkdir) { + root.mkdirs(); + } + } + + @Override + public void close() throws IOException { + delete(); + } + + public void delete() throws IOException { + root.close(); + } + + /** + * @return open a write stream to the subject file + * @throws IOException can't open the stream + */ + public OutputStream openWSubject() throws IOException { + return s.openOutputStream(bufferSize); + } + + /** + * @return open a write stream to the predicate file + * @throws IOException can't open the stream + */ + public OutputStream openWPredicate() throws IOException { + return p.openOutputStream(bufferSize); + } + + /** + * @return open a write stream to the object file + * @throws IOException can't open the stream + */ + public OutputStream openWObject() throws IOException { + return o.openOutputStream(bufferSize); + } + + /** + * @return open a read stream to the subject file + * @throws IOException can't open the stream + */ + public InputStream openRSubject() throws IOException { + return s.openInputStream(bufferSize); + } + + /** + * @return open a read stream to the predicate file + * @throws IOException can't open the stream + */ + public InputStream openRPredicate() throws IOException { + return p.openInputStream(bufferSize); + } + + /** + * @return open a read stream to the object file + * @throws IOException can't open the stream + */ + public InputStream openRObject() throws IOException { + return o.openInputStream(bufferSize); + } + + /** + * @return the path to the subject file + */ + public CloseSuppressPath getSubjectPath() { + return s; + } + + /** + * @return the path to the predicate file + */ + public CloseSuppressPath getPredicatePath() { + return p; + } + + /** + * @return the path to the object file + */ + public CloseSuppressPath getObjectPath() { + return o; + } + + /** + * compute this triple file from multiple triple files + * + * @param triples triple files container + * @param async if the method should load all the files asynchronously or not + * @throws IOException io exception while reading/writing + * @throws InterruptedException interruption while waiting for the async thread + */ + public void compute(List<TripleFile> triples, boolean async) throws IOException, InterruptedException { + if (!async) { + computeSubject(triples, false); + computePredicate(triples, false); + computeObject(triples, false); + } else { + ExceptionThread.async("SectionMerger" + root.getFileName(), + () -> computeSubject(triples, true), + () -> computePredicate(triples, true), + () -> computeObject(triples, true) + ).joinAndCrashIfRequired(); + } + } + + private void computeSubject(List<TripleFile> triples, boolean async) throws IOException { + computeSection(triples, "subject", 0, 33, this::openWSubject, TripleFile::openRSubject, TripleFile::getSubjectPath, async); + } + + private void computePredicate(List<TripleFile> triples, boolean async) throws IOException { + computeSection(triples, "predicate", 33, 66, this::openWPredicate, TripleFile::openRPredicate, TripleFile::getPredicatePath, async); + } + + private void computeObject(List<TripleFile> triples, boolean async) throws IOException { + computeSection(triples, "object", 66, 100, this::openWObject, TripleFile::openRObject, TripleFile::getObjectPath, async); + } + + private void computeSection(List<TripleFile> triples, String section, int start, int end, ExceptionSupplier<OutputStream, IOException> openW, ExceptionFunction<TripleFile, InputStream, IOException> openR, Function<TripleFile, CloseSuppressPath> fileDelete, boolean async) throws IOException { + IntermediateListener il = new IntermediateListener(listener); + if (async) {
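+ // async mode: each of the three sections is merged from its own thread (see + // ExceptionThread.async in compute), so register the worker with the listener + // for per-thread progress reporting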
+ listener.registerThread(Thread.currentThread().getName()); + } else { + il.setRange(start, end); + } + il.setPrefix("merging " + section + " section " + root.getFileName() + ": "); + il.notifyProgress(0, "merging section"); + + // readers to create the merge tree + CompressNodeReader[] readers = new CompressNodeReader[triples.size()]; + Closeable[] fileDeletes = new Closeable[triples.size()]; + try { + long size = 0L; + for (int i = 0; i < triples.size(); i++) { + CompressNodeReader reader = new CompressNodeReader(openR.apply(triples.get(i))); + size += reader.getSize(); + readers[i] = reader; + fileDeletes[i] = fileDelete.apply(triples.get(i)); + } + + // section + try (OutputStream output = openW.get()) { + CompressUtil.writeCompressedSection(CompressNodeMergeIterator.buildOfTree(readers), size, output, il); + } + } finally { + if (async) { + listener.unregisterThread(Thread.currentThread().getName()); + } + try { + IOUtil.closeAll(readers); + } finally { + IOUtil.closeAll(fileDeletes); + } + } + } + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResult.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResult.java new file mode 100644 index 00000000..901d2617 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResult.java @@ -0,0 +1,20 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.triples.TempTriples; + +import java.io.Closeable; + +/** + * Result for the {@link org.rdfhdt.hdt.util.io.compress.MapCompressTripleMerger} + * @author Antoine Willerval + */ +public interface TripleCompressionResult extends Closeable { + /** + * @return the sorted triples + */ + TempTriples getTriples(); + /** + * @return the number of triples + */ + long getTripleCount(); +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultEmpty.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultEmpty.java new file mode 100644 index 00000000..b06cad74 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultEmpty.java @@ -0,0 +1,42 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.impl.OneReadTempTriples; + +import java.io.IOException; +import java.util.Iterator; + +public class TripleCompressionResultEmpty implements TripleCompressionResult { + private final TripleComponentOrder order; + + public TripleCompressionResultEmpty(TripleComponentOrder order) { + this.order = order; + } + + @Override + public TempTriples getTriples() { + return new OneReadTempTriples(new Iterator<>() { + @Override + public boolean hasNext() { + return false; + } + + @Override + public TripleID next() { + return null; + } + }, order, 0); + } + + @Override + public long getTripleCount() { + return 0; + } + + @Override + public void close() throws IOException { + + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultFile.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultFile.java new file mode 100644 index 00000000..cb51e55d --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultFile.java @@ -0,0 +1,44 @@ +package
org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.impl.OneReadTempTriples; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressTripleReader; + +import java.io.IOException; + +/** + * Implementation of {@link org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult} for full file reading + * + * @author Antoine Willerval + */ +public class TripleCompressionResultFile implements TripleCompressionResult { + private final long tripleCount; + private final CompressTripleReader reader; + private final TripleComponentOrder order; + private final CloseSuppressPath triples; + + public TripleCompressionResultFile(long tripleCount, CloseSuppressPath triples, TripleComponentOrder order, int bufferSize) throws IOException { + this.tripleCount = tripleCount; + this.reader = new CompressTripleReader(triples.openInputStream(bufferSize)); + this.order = order; + this.triples = triples; + } + + @Override + public TempTriples getTriples() { + return new OneReadTempTriples(reader.asIterator(), order, tripleCount); + } + + @Override + public long getTripleCount() { + return tripleCount; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(reader, triples); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultPartial.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultPartial.java new file mode 100644 index 00000000..ebb777a0 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultPartial.java @@ -0,0 +1,64 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.impl.OneReadTempTriples; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressTripleMergeIterator; +import org.rdfhdt.hdt.util.io.compress.CompressTripleReader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Implementation of {@link org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult} for partial file reading + * @author Antoine Willerval + */ +public class TripleCompressionResultPartial implements TripleCompressionResult { + private final List files; + private final TempTriples triples; + private final long tripleCount; + private final TripleComponentOrder order; + + public TripleCompressionResultPartial(List files, long tripleCount, TripleComponentOrder order, int bufferSize) throws IOException { + this.files = new ArrayList<>(files.size()); + this.tripleCount = tripleCount; + this.order = order; + this.triples = new OneReadTempTriples(createBTree(files, 0, files.size(), bufferSize).asIterator(), order, tripleCount); + } + + private ExceptionIterator createBTree(List files, int start, int end, int bufferSize) throws IOException { + int size = end - start; + if (size <= 0) { + return ExceptionIterator.empty(); + } + if (size == 1) { + CompressTripleReader r = new CompressTripleReader(files.get(start).openInputStream(bufferSize)); + this.files.add(r); + return r; + } + int mid = (start + end) / 2; + ExceptionIterator left = 
createBTree(files, start, mid, bufferSize); + ExceptionIterator right = createBTree(files, mid, end, bufferSize); + return new CompressTripleMergeIterator(left, right, order); + } + + @Override + public TempTriples getTriples() { + return triples; + } + + @Override + public long getTripleCount() { + return tripleCount; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(files); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java index 17b88c80..2444a489 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java @@ -75,12 +75,14 @@ public PlainHeader(HDTOptions spec) { */ @Override public void insert(CharSequence subject, CharSequence predicate, CharSequence object) { + TripleString tripleString; String objStr = object.toString(); if(objStr.charAt(0)=='<'|| objStr.charAt(0)=='"' || objStr.startsWith("http://")||objStr.startsWith("file://")) { - triples.add(new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), HeaderUtil.cleanURI(object))); + tripleString = new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), HeaderUtil.cleanURI(object)); } else { - triples.add(new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), '"'+objStr+'"')); + tripleString = new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), '"'+objStr+'"'); } + triples.add(tripleString); } /* (non-Javadoc) @@ -152,7 +154,13 @@ public int getNumberOfElements() { */ @Override public IteratorTripleString search(CharSequence subject, CharSequence predicate, CharSequence object) { - TripleString pattern = new TripleString(subject.toString(), predicate.toString(), object.toString()); + TripleString pattern; + String objStr = object.toString(); + if(objStr.isEmpty() || objStr.charAt(0)=='<'|| objStr.charAt(0)=='"' || objStr.startsWith("http://")||objStr.startsWith("file://")) { + pattern = new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), HeaderUtil.cleanURI(object)); + } else { + pattern = new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), '"'+objStr+'"'); + } return new PlainHeaderIterator(this, pattern); } @@ -164,13 +172,7 @@ public void processTriple(TripleString triple, long pos) { @Override public void remove(CharSequence subject, CharSequence predicate, CharSequence object) { TripleString pattern = new TripleString(subject.toString(), predicate.toString(), object.toString()); - Iterator iter = triples.iterator(); - while(iter.hasNext()) { - TripleString next = iter.next(); - if(next.match(pattern)) { - iter.remove(); - } - } + triples.removeIf(next -> next.match(pattern)); } @Override diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/AsyncIteratorFetcher.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/AsyncIteratorFetcher.java new file mode 100644 index 00000000..119c3df2 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/AsyncIteratorFetcher.java @@ -0,0 +1,46 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Iterator; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Supplier; + +/** + * Synchronise an iterator + * + * @param iterator type + * @author Antoine Willerval + */ +public class AsyncIteratorFetcher 
implements Supplier<E> { + private final Iterator<E> iterator; + private final Lock lock = new ReentrantLock(); + private boolean end; + + public AsyncIteratorFetcher(Iterator<E> iterator) { + this.iterator = iterator; + } + + /** + * @return an element from the iterator, this method is thread safe + */ + @Override + public E get() { + lock.lock(); + try { + if (iterator.hasNext()) { + return iterator.next(); + } + end = true; + return null; + } finally { + lock.unlock(); + } + } + + /** + * @return is the end + */ + public boolean isEnd() { + return end; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java new file mode 100644 index 00000000..bf59bdfb --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java @@ -0,0 +1,168 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Iterator; +import java.util.Objects; +import java.util.function.Consumer; + +/** + * alternative iterator with exception throwing + * @param <T> the iterator type + * @param <E> the allowed exception + * @author Antoine Willerval + */ +public interface ExceptionIterator<T, E extends Exception> { + @FunctionalInterface + interface ExceptionConsumer<T, E extends Exception> { + void consume(T element) throws E; + } + /** + * create an exception iterator from a basic iterator + * @param it the iterator to wrap + * @param <T> the iterator type + * @param <E> the exception to allow + * @return exception iterator + */ + static <T, E extends Exception> ExceptionIterator<T, E> of(final Iterator<T> it) { + return new ExceptionIterator<>() { + @Override + public boolean hasNext() { + return it.hasNext(); + } + + @Override + public T next() { + return it.next(); + } + + @Override + public void remove() { + it.remove(); + } + }; + } /** + * create an empty iterator + * @param <T> the iterator type + * @param <E> the exception to allow + * @return exception iterator + */ + static <T, E extends Exception> ExceptionIterator<T, E> empty() { + return of(new Iterator<>() { + @Override + public boolean hasNext() { + return false; + } + + @Override + public T next() { + return null; + } + }); + } + + + /** + * @return if the iterator has a next element + * @throws E exception triggered by the implementation + */ + boolean hasNext() throws E; + + /** + * @return the next iterator element + * @throws E exception triggered by the implementation + */ + T next() throws E; + + /** + * remove the last element returned by the iterator + * @throws E exception triggered by the implementation + */ + default void remove() throws E { + throw new UnsupportedOperationException("remove"); + } + + /** + * loop over all the elements + * @param action the action to handle the element + * @throws E exception triggered by the implementation + */ + default void forEachRemaining(ExceptionConsumer<T, E> action) throws E { + Objects.requireNonNull(action); + while (hasNext()) + action.consume(next()); + } + + /** + * map this iterator with a function + * @param mappingFunc the mapping function + * @param <N> the new iterator type + * @return iterator + */ + default <N> ExceptionIterator<N, E> map(MapExceptionIterator.ExceptionFunction<T, N, E> mappingFunc) { + return new MapExceptionIterator<>(this, mappingFunc); + } + /** + * map this iterator with a function + * @param mappingFunc the mapping function + * @param <N> the new iterator type + * @return iterator + */ + default <N> ExceptionIterator<N, E> map(MapExceptionIterator.MapWithIdFunction<T, N, E> mappingFunc) { + return new MapExceptionIterator<>(this, mappingFunc); + } + + /** + * convert this exception iterator to a
base iterator and convert the exception to RuntimeException + * @return iterator + */ + default Iterator<T> asIterator() { + return new Iterator<>() { + @Override + public boolean hasNext() { + try { + return ExceptionIterator.this.hasNext(); + } catch (Exception e) { + if (e instanceof RuntimeException) { + throw (RuntimeException) e; + } + throw new RuntimeException(e); + } + } + + @Override + public T next() { + try { + return ExceptionIterator.this.next(); + } catch (Exception e) { + if (e instanceof RuntimeException) { + throw (RuntimeException) e; + } + throw new RuntimeException(e); + } + } + + @Override + public void forEachRemaining(Consumer<? super T> action) { + try { + ExceptionIterator.this.forEachRemaining(action::accept); + } catch (Exception e) { + if (e instanceof RuntimeException) { + throw (RuntimeException) e; + } + throw new RuntimeException(e); + } + } + + @Override + public void remove() { + try { + ExceptionIterator.this.remove(); + } catch (Exception e) { + if (e instanceof RuntimeException) { + throw (RuntimeException) e; + } + throw new RuntimeException(e); + } + } + }; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileChunkIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileChunkIterator.java new file mode 100644 index 00000000..5ce8d019 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileChunkIterator.java @@ -0,0 +1,105 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Iterator; +import java.util.function.Consumer; +import java.util.function.ToLongFunction; + +/** + * Iterator to split an iterator stream into multiple files: the iterator returns {@link #hasNext()} == true while the + * current file is being filled; once it returns false, {@link #hasNewFile()} should be called to check whether another + * file can be created, which re-allows {@link #hasNext()} to return true + * @author Antoine Willerval + */ +public class FileChunkIterator<E> implements Iterator<E> { + private final ToLongFunction<E> estimationFunction; + private final Iterator<E> it; + private final long maxSize; + private long totalSize = 0L; + private long currentSize = 0L; + private E next; + private boolean stop = false; + + /** + * create a file iterator from a stream and a max size + * @param it the iterator + * @param maxSize the maximum size of each file, this size is estimated, so files can be bigger.
+ * @param estimationFunction the element estimation function + */ + public FileChunkIterator(Iterator<E> it, long maxSize, ToLongFunction<E> estimationFunction) { + this.it = it; + this.maxSize = maxSize; + this.estimationFunction = estimationFunction; + } + + @Override + public boolean hasNext() { + if (stop) + return false; + + if (next != null) + return true; + + if (it.hasNext()) { + next = it.next(); + long estimation = estimationFunction.applyAsLong(next); + + totalSize += estimation; + + if (currentSize + estimation >= maxSize) { + stop = true; + currentSize = estimation; + return false; + } + + currentSize += estimation; + return true; + } + return false; + } + + @Override + public E next() { + if (!hasNext()) { + return null; + } + E t = next; + next = null; + return t; + } + + @Override + public void remove() { + it.remove(); + } + + @Override + public void forEachRemaining(Consumer<? super E> action) { + it.forEachRemaining(action); + } + + /** + * force the iterator to create a new file after the next hasNext() + */ + public void forceNewFile() { + long estimation; + if (next != null) { + estimation = estimationFunction.applyAsLong(next); + } else { + estimation = 0; + } + currentSize = estimation; + stop = true; + } + + /** + * @return if we need to open a new file + */ + public boolean hasNewFile() { + stop = false; + return hasNext(); + } + + public long getTotalSize() { + return totalSize; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIDIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIDIterator.java new file mode 100644 index 00000000..22890e3f --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIDIterator.java @@ -0,0 +1,17 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.triples.TripleID; + +import java.util.Iterator; + +public class FileTripleIDIterator extends FileChunkIterator<TripleID> { + /** + * create a file iterator from a stream and a max size + * + * @param it the iterator + * @param maxSize the maximum size of each file, this size is estimated, so files can be bigger.
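+ * + * Usage sketch (illustrative; consume(TripleID) stands for any chunk consumer): + * FileTripleIDIterator it = new FileTripleIDIterator(ids, chunkSize); + * do { + * while (it.hasNext()) { + * consume(it.next()); + * } + * } while (it.hasNewFile()); // hasNewFile() re-allows hasNext() for the next chunk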
+ */ + public FileTripleIDIterator(Iterator<TripleID> it, long maxSize) { + super(it, maxSize, tripleID -> 4L * Long.BYTES); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java new file mode 100644 index 00000000..249a8e73 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java @@ -0,0 +1,37 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.string.ByteStringUtil; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.Iterator; +import java.util.function.Consumer; +import java.util.function.ToIntFunction; + +/** + * Iterator to split an iterator stream into multiple files: the iterator returns {@link #hasNext()} == true while the + * current file is being filled; once it returns false, {@link #hasNewFile()} should be called to check whether another + * file can be created, which re-allows {@link #hasNext()} to return true + * @author Antoine Willerval + */ +public class FileTripleIterator extends FileChunkIterator<TripleString> { + public static long estimateSize(TripleString tripleString) { + try { + return tripleString.asNtriple().toString().getBytes(ByteStringUtil.STRING_ENCODING).length; + } catch (IOException e) { + throw new RuntimeException("Can't estimate the size of the triple " + tripleString, e); + } + } + + /** + * create a file iterator from a stream and a max size + * + * @param it the iterator + * @param maxSize the maximum size of each file, this size is estimated, so files can be bigger. + */ + public FileTripleIterator(Iterator<TripleString> it, long maxSize) { + super(it, maxSize, FileTripleIterator::estimateSize); + } + +} \ No newline at end of file diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapExceptionIterator.java new file mode 100644 index 00000000..5eef7e71 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapExceptionIterator.java @@ -0,0 +1,47 @@ +package org.rdfhdt.hdt.iterator.utils; + +/** + * Exception Iterator to map a value to another + * @param <M> origin type + * @param <N> return type + * @param <E> the allowed exception + * @author Antoine Willerval + */ +public class MapExceptionIterator<M, N, E extends Exception> implements ExceptionIterator<N, E> { + private final MapWithIdFunction<M, N, E> mappingFunction; + private final ExceptionIterator<M, E> base; + private long index; + + public MapExceptionIterator(ExceptionIterator<M, E> base, ExceptionFunction<M, N, E> mappingFunction) { + this(base, (m, i) -> mappingFunction.apply(m)); + } + public MapExceptionIterator(ExceptionIterator<M, E> base, MapWithIdFunction<M, N, E> mappingFunction) { + this.base = base; + this.mappingFunction = mappingFunction; + } + + @Override + public boolean hasNext() throws E { + return base.hasNext(); + } + + @Override + public N next() throws E { + return mappingFunction.apply(base.next(), index++); + } + + @Override + public void remove() throws E { + base.remove(); + } + + @FunctionalInterface + public interface MapWithIdFunction<M, N, E extends Exception> { + N apply(M element, long index) throws E; + } + + @FunctionalInterface + public interface ExceptionFunction<M, N, E extends Exception> { + N apply(M element) throws E; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java new file mode 100644 index 00000000..a24d9e81 ---
/dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java @@ -0,0 +1,136 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.function.Function; + +public class MergeExceptionIterator implements ExceptionIterator { + + /** + * Create a tree of merge iterators from an array of element + * @param itFunction a function to create an iterator from an element + * @param comp comparator for the merge iterator + * @param array the elements + * @param length the number of elements + * @param input of the element + * @param type of the element in the iterator + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( + Function> itFunction, Comparator comp, I[] array, int length) { + return buildOfTree(itFunction, comp, array, 0, length); + } + + /** + * Create a tree of merge iterators from an array of element + * @param itFunction a function to create an iterator from an element + * @param comp comparator for the merge iterator + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param type of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( + Function> itFunction, Comparator comp, I[] array, int start, int end) { + return buildOfTree(itFunction, comp, Arrays.asList(array), start, end); + } + + /** + * Create a tree of merge iterators from an array of element + * @param itFunction a function to create an iterator from an element + * @param comp comparator for the merge iterator + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param type of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( + Function> itFunction, Comparator comp, List array, int start, int end) { + int length = end - start; + if (length <= 0) { + return ExceptionIterator.empty(); + } + if (length == 1) { + return itFunction.apply(array.get(start)); + } + int mid = (start + end) / 2; + return new MergeExceptionIterator<>( + buildOfTree(itFunction, comp, array, start, mid), + buildOfTree(itFunction, comp, array, mid, end), + comp + ); + } + + private final ExceptionIterator in1, in2; + private final Comparator comp; + private T next; + private T prevE1; + private T prevE2; + + public MergeExceptionIterator(ExceptionIterator in1, ExceptionIterator in2, Comparator comp) { + this.in1 = in1; + this.in2 = in2; + this.comp = comp; + } + + @Override + public boolean hasNext() throws E { + if (next != null) { + return true; + } + + // read next element 1 if required + if (prevE1 == null && in1.hasNext()) { + prevE1 = in1.next(); + } + // read next element 2 if required + if (prevE2 == null && in2.hasNext()) { + prevE2 = in2.next(); + } + + if (prevE1 != null && prevE2 != null) { + // we have an element from both stream, compare them + if (comp.compare(prevE1, prevE2) < 0) { + // element 1 lower, return it + next = prevE1; + prevE1 = null; + } else { + // element 2 lower, return it + next = prevE2; + prevE2 = null; + } + return true; + } + // we have at most one element + if (prevE1 != null) { + // return element 1 + next = prevE1; + prevE1 = null; + return true; + } + if (prevE2 != null) { + // return element 2 + 
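+            // at this point stream 1 is exhausted (prevE1 == null and in1 has no next),
+            // so the remaining elements can only come from stream 2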
next = prevE2; + prevE2 = null; + return true; + } + // nothing else + return false; + } + + @Override + public T next() throws E { + if (!hasNext()) { + return null; + } + T next = this.next; + this.next = null; + return next; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java new file mode 100644 index 00000000..dd372185 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java @@ -0,0 +1,57 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.listener.ProgressListener; + +import java.util.Objects; + +/** + * ExceptionIterator Wrapper to notify a progress + * + * @param iterator type + * @param iterator exception + * @author Antoine WillervalF + */ +public class NotificationExceptionIterator implements ExceptionIterator { + private final ExceptionIterator it; + private final long size; + private final long split; + private final String message; + private final ProgressListener listener; + private long current = 0L; + + public NotificationExceptionIterator(ExceptionIterator it, long size, long split, String message, ProgressListener listener) { + this.it = Objects.requireNonNull(it, "it can't be null!"); + if (size < 0) { + throw new IllegalArgumentException("size can't be negative!"); + } + if (split < 0) { + throw new IllegalArgumentException("split can't be negative! " + split); + } + // set size to be at least 1 to allow empty next() error + this.size = Math.max(1, size); + // minimize split by size to avoid dividing by 0 + this.split = Math.min(split, size); + this.message = Objects.requireNonNull(message, "message can't be null!"); + this.listener = Objects.requireNonNullElseGet(listener, () -> (perc, msg) -> { + }); + } + + @Override + public boolean hasNext() throws E { + return it.hasNext(); + } + + @Override + public T next() throws E { + current++; + if (current % (size / split) == 0) { + listener.notifyProgress((float) current / size, message + " " + current + "/" + size); + } + return it.next(); + } + + @Override + public void remove() throws E { + it.remove(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java index 2c279575..904ffd44 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java @@ -1,14 +1,6 @@ package org.rdfhdt.hdt.iterator.utils; -import org.rdfhdt.hdt.compact.integer.VByte; -import org.rdfhdt.hdt.util.io.IOUtil; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.PipedInputStream; -import java.io.PipedOutputStream; -import java.nio.charset.StandardCharsets; +import java.util.concurrent.ArrayBlockingQueue; import java.util.Iterator; import java.util.function.Function; @@ -20,6 +12,7 @@ */ public class PipedCopyIterator implements Iterator { + /** * RuntimeException generated by the PipedCopyIterator * @@ -33,7 +26,7 @@ public PipedIteratorException(String message, Throwable t) { /** - * Callback for the {@link #createOfCallback(PipedCopyIterator.Parser, PipedCopyIterator.PipeCallBack)} method + * Callback for the {@link #createOfCallback(PipedCopyIterator.PipeCallBack)} method * * @param the iterator type * @author Antoine 
Willerval @@ -53,13 +46,12 @@ public interface PipeCallBack { /** * create a piped iterator from a callback runner, the call to the callback should be made in the callbackRunner * - * @param serializer serializer to pass the data * @param callbackRunner the callback runner * @param type of the iterator * @return the iterator */ - public static PipedCopyIterator createOfCallback(Parser serializer, PipeCallBack callbackRunner) { - PipedCopyIterator pipe = new PipedCopyIterator<>(serializer); + public static PipedCopyIterator createOfCallback(PipeCallBack callbackRunner) { + PipedCopyIterator pipe = new PipedCopyIterator<>(); Thread thread = new Thread(() -> { try { @@ -73,46 +65,49 @@ public static PipedCopyIterator createOfCallback(Parser serializer, Pi return pipe; } - public interface Parser { - static void writeString(CharSequence s, OutputStream out) throws IOException { - byte[] bytes = s.toString().getBytes(StandardCharsets.UTF_8); - VByte.encode(out, bytes.length); - out.write(bytes); - } - static String readString(InputStream in) throws IOException { - int size = (int) VByte.decode(in); - byte[] bytes = IOUtil.readBuffer(in, size, null); - return new String(bytes, StandardCharsets.UTF_8); - } - void write(T t, OutputStream stream) throws IOException; - T read(InputStream stream) throws IOException; + + private interface QueueObject { + boolean end(); + + T get(); } - private final PipedInputStream in; - private final PipedOutputStream out; - private final Parser serializer; - private T next; - private boolean end; - private PipedIteratorException exception; + private class ElementQueueObject implements QueueObject { + private final T obj; - public PipedCopyIterator(Parser serializer) { - this.serializer = serializer; - try { - in = new PipedInputStream(); - out = new PipedOutputStream(); - in.connect(out); - } catch (IOException e) { - throw new PipedIteratorException("can't connect pipe", e); + private ElementQueueObject(T obj) { + this.obj = obj; + } + + + @Override + public boolean end() { + return false; + } + + @Override + public T get() { + return obj; } } - private int readByte() { - try { - return in.read(); - } catch (IOException e) { - throw new PipedIteratorException("Can't read byte", e); + private class EndQueueObject implements QueueObject { + @Override + public boolean end() { + return true; + } + + @Override + public T get() { + throw new IllegalArgumentException(); } } + private final ArrayBlockingQueue> queue = new ArrayBlockingQueue<>(16); + + private T next; + private boolean end; + private PipedIteratorException exception; + @Override public boolean hasNext() { if (end) { @@ -122,19 +117,21 @@ public boolean hasNext() { return true; } - int b = readByte(); - if (b == 0) { + QueueObject obj; + try { + obj = queue.take(); + } catch (InterruptedException e) { + throw new PipedIteratorException("Can't read pipe", e); + } + + if (obj.end()) { end = true; if (exception != null) { throw exception; } return false; } - try { - next = serializer.read(in); - } catch (IOException e) { - throw new PipedIteratorException("Can't read pipe", e); - } + next = obj.get(); return true; } @@ -160,9 +157,8 @@ public void closePipe(Throwable e) { } } try { - // end byte - out.write(0); - } catch (IOException ee) { + queue.put(new EndQueueObject()); + } catch (InterruptedException ee) { throw new PipedIteratorException("Can't close pipe", ee); } } @@ -188,10 +184,8 @@ public Iterator mapWithId(MapIterator.MapWithIdFunction mappingFunc public void addElement(T node) { try { - // not end 
byte - out.write(1); - serializer.write(node, out); - } catch (IOException ee) { + queue.put(new ElementQueueObject(node)); + } catch (InterruptedException ee) { throw new PipedIteratorException("Can't add element to pipe", ee); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SizeFetcher.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SizeFetcher.java new file mode 100644 index 00000000..e91bca60 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SizeFetcher.java @@ -0,0 +1,59 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.TripleString; + +import java.util.function.Supplier; +import java.util.function.ToLongFunction; + +/** + * Size a supplier AFTER the last element, the maximum sum size will be maxSize - 1 + sizeOf(lastElement) + * + * @param supplier type + */ +public class SizeFetcher implements Supplier { + public static SizeFetcher ofTripleString(Supplier supplier, long maxSize) { + return new SizeFetcher<>(supplier, FileTripleIterator::estimateSize, maxSize); + } + + public static SizeFetcher ofTripleLong(Supplier supplier, long maxSize) { + return new SizeFetcher<>(supplier, tripleID -> 4L * Long.BYTES, maxSize); + } + + private final Supplier supplier; + private final ToLongFunction sizeGetter; + + private final long maxSize; + + private long size; + + public SizeFetcher(Supplier supplier, ToLongFunction sizeGetter, long maxSize) { + this.supplier = supplier; + this.sizeGetter = sizeGetter; + this.maxSize = maxSize; + } + + @Override + public E get() { + if (!canContinue()) { + return null; + } + E e = supplier.get(); + + if (e == null) { + return null; + } + + size += sizeGetter.applyAsLong(e); + + return e; + } + + private boolean canContinue() { + return size < maxSize; + } + + public long getSize() { + return size; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java index 40bde70d..cc5018d2 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java @@ -94,25 +94,8 @@ public static RDFParserCallback getParserCallback(RDFNotation notation, boolean * @param notation the rdf notation to parse * @return iterator */ - public static Iterator readAsIterator(RDFParserCallback parser, InputStream stream, String baseUri, boolean keepBNode, RDFNotation notation) { - return PipedCopyIterator.createOfCallback(TripleStringParser.INSTANCE, pipe -> parser.doParse(stream, baseUri, notation, keepBNode, (triple, pos) -> pipe.addElement(triple))); + public static PipedCopyIterator readAsIterator(RDFParserCallback parser, InputStream stream, String baseUri, boolean keepBNode, RDFNotation notation) { + return PipedCopyIterator.createOfCallback(pipe -> parser.doParse(stream, baseUri, notation, keepBNode, (triple, pos) -> pipe.addElement(triple.tripleToString()))); } - private static class TripleStringParser implements PipedCopyIterator.Parser { - private static final TripleStringParser INSTANCE = new TripleStringParser(); - @Override - public void write(TripleString tripleString, OutputStream stream) throws IOException { - PipedCopyIterator.Parser.writeString(tripleString.getSubject(), stream); - PipedCopyIterator.Parser.writeString(tripleString.getPredicate(), stream); - PipedCopyIterator.Parser.writeString(tripleString.getObject(), stream); - } - - @Override 
- public TripleString read(InputStream stream) throws IOException { - String s = PipedCopyIterator.Parser.readString(stream); - String p = PipedCopyIterator.Parser.readString(stream); - String o = PipedCopyIterator.Parser.readString(stream); - return new TripleString(s, p, o); - } - } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java new file mode 100644 index 00000000..59da86f4 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java @@ -0,0 +1,39 @@ +package org.rdfhdt.hdt.triples; + +import org.rdfhdt.hdt.util.string.CharSequenceComparator; + +import java.util.Comparator; + +public class IndexedNode implements Comparable { + private static final Comparator NODE_COMPARATOR = CharSequenceComparator.getInstance(); + private CharSequence node; + private long index; + + public IndexedNode(CharSequence node, long index) { + this.node = node; + this.index = index; + } + public IndexedNode() { + } + + public CharSequence getNode() { + return node; + } + + public long getIndex() { + return index; + } + + public void setIndex(long index) { + this.index = index; + } + + public void setNode(CharSequence node) { + this.node = node; + } + + @Override + public int compareTo(IndexedNode o) { + return NODE_COMPARATOR.compare(node, o.getNode()); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedTriple.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedTriple.java new file mode 100644 index 00000000..07927398 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedTriple.java @@ -0,0 +1,43 @@ +package org.rdfhdt.hdt.triples; + +/** + * A triple of {@link org.rdfhdt.hdt.triples.IndexedNode} + * @author Antoine Willerval + */ +public class IndexedTriple { + private IndexedNode subject; + private IndexedNode predicate; + private IndexedNode object; + + public IndexedTriple() { + } + + public IndexedTriple(IndexedNode subject, IndexedNode predicate, IndexedNode object) { + load(subject, predicate, object); + } + + public IndexedNode getSubject() { + return subject; + } + + public IndexedNode getPredicate() { + return predicate; + } + + public IndexedNode getObject() { + return object; + } + + /** + * load a new s p o inside this triple + * @param subject the subject + * @param predicate the predicate + * @param object the object + */ + public void load(IndexedNode subject, IndexedNode predicate, IndexedNode object) { + this.subject = subject; + this.predicate = predicate; + this.object = object; + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TempTriples.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TempTriples.java index 94af28ca..cdfd1106 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TempTriples.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TempTriples.java @@ -84,11 +84,6 @@ public interface TempTriples extends TriplesPrivate, Closeable { */ void setOrder(TripleComponentOrder order); - /** - * Gets the currently set order(TripleComponentOrder) - */ - TripleComponentOrder getOrder(); - /** * Clear all triples, resulting in an empty triples section. 
*/ diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TriplesPrivate.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TriplesPrivate.java index 728a423b..2d570020 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TriplesPrivate.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TriplesPrivate.java @@ -5,6 +5,7 @@ import java.io.InputStream; import java.io.OutputStream; +import org.rdfhdt.hdt.enums.TripleComponentOrder; import org.rdfhdt.hdt.iterator.SuppliableIteratorTripleID; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.ControlInfo; @@ -80,4 +81,9 @@ public interface TriplesPrivate extends Triples { * The TempTriples input to load from */ void load(TempTriples input, ProgressListener listener); + + /** + * Gets the currently set order(TripleComponentOrder) + */ + TripleComponentOrder getOrder(); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java index bfe186ac..34043379 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java @@ -49,10 +49,7 @@ import org.rdfhdt.hdt.iterator.SuppliableIteratorTripleID; import org.rdfhdt.hdt.iterator.SequentialSearchIteratorTripleID; import org.rdfhdt.hdt.listener.ProgressListener; -import org.rdfhdt.hdt.options.ControlInfo; -import org.rdfhdt.hdt.options.ControlInformation; -import org.rdfhdt.hdt.options.HDTOptions; -import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.options.*; import org.rdfhdt.hdt.triples.IteratorTripleID; import org.rdfhdt.hdt.triples.TempTriples; import org.rdfhdt.hdt.triples.TripleID; @@ -73,7 +70,7 @@ public class BitmapTriples implements TriplesPrivate { private static final Logger log = LoggerFactory.getLogger(BitmapTriples.class); - protected TripleComponentOrder order=TripleComponentOrder.SPO; + protected TripleComponentOrder order; protected Sequence seqY, seqZ, indexZ, predicateCount; protected Bitmap bitmapY, bitmapZ, bitmapIndexZ; @@ -90,9 +87,11 @@ public BitmapTriples() { } public BitmapTriples(HDTOptions spec) { - String orderStr = spec.get("triplesOrder"); - if(orderStr!=null) { - order = TripleComponentOrder.valueOf(orderStr); + String orderStr = spec.get(HDTOptionsKeys.TRIPLE_ORDER_KEY); + if(orderStr == null) { + this.order = TripleComponentOrder.SPO; + } else { + this.order = TripleComponentOrder.valueOf(orderStr); } bitmapY = BitmapFactory.createBitmap(spec.get("bitmap.y")); @@ -839,6 +838,7 @@ public void close() throws IOException { } } + @Override public TripleComponentOrder getOrder() { return this.order; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/OneReadTempTriples.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/OneReadTempTriples.java new file mode 100644 index 00000000..f477bd8c --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/OneReadTempTriples.java @@ -0,0 +1,243 @@ +package org.rdfhdt.hdt.triples.impl; + +import org.rdfhdt.hdt.dictionary.impl.DictionaryIDMapping; +import org.rdfhdt.hdt.enums.ResultEstimationType; +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.iterator.SuppliableIteratorTripleID; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; 
+import org.rdfhdt.hdt.triples.IteratorTripleID; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.Triples; +import org.rdfhdt.hdt.util.io.CountInputStream; +import org.rdfhdt.hdt.util.io.compress.NoDuplicateTripleIDIterator; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Iterator; + +/** + * {@link org.rdfhdt.hdt.triples.TempTriples} only readable once with the {@link #searchAll()} method with a predefined + * order, trying to set another order will lead to an exception, trying to use any other method can lead to a + * {@link org.rdfhdt.hdt.exceptions.NotImplementedException}. + * @author Antoine Willerval + */ +public class OneReadTempTriples implements TempTriples { + private IteratorTripleID iterator; + private TripleComponentOrder order; + + public OneReadTempTriples(Iterator iterator, TripleComponentOrder order, long triples) { + this.iterator = new SimpleIteratorTripleID(iterator, order, triples); + this.order = order; + } + + @Override + public boolean insert(long subject, long predicate, long object) { + throw new NotImplementedException(); + } + + @Override + public boolean insert(TripleID... triples) { + throw new NotImplementedException(); + } + + @Override + public boolean remove(TripleID... pattern) { + throw new NotImplementedException(); + } + + @Override + public void sort(ProgressListener listener) { + // already sorted + } + + @Override + public void removeDuplicates(ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void setOrder(TripleComponentOrder order) { + if (order != this.order) { + throw new IllegalArgumentException("order asked by isn't the same as the set one!"); + } + } + + @Override + public void clear() { + throw new NotImplementedException(); + } + + @Override + public void load(Triples triples, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void replaceAllIds(DictionaryIDMapping mapSubj, DictionaryIDMapping mapPred, DictionaryIDMapping mapObj) { + throw new NotImplementedException(); + } + + @Override + public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public SuppliableIteratorTripleID search(TripleID pattern) { + throw new NotImplementedException(); + } + + @Override + public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void generateIndex(ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void loadIndex(InputStream input, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void mapIndex(CountInputStream input, File f, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void saveIndex(OutputStream output, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void load(TempTriples input, ProgressListener listener) { + if (input instanceof OneReadTempTriples) { + OneReadTempTriples input2 = (OneReadTempTriples) input; + 
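+            // adopt the iterator and order of the other one-read instance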
this.iterator = input2.iterator; + this.order = input2.order; + } else { + throw new NotImplementedException(); + } + } + + @Override + public TripleComponentOrder getOrder() { + return order; + } + + @Override + public IteratorTripleID searchAll() { + return new NoDuplicateTripleIDIterator(iterator); + } + + @Override + public long getNumberOfElements() { + return iterator.estimatedNumResults(); + } + + @Override + public long size() { + return iterator.estimatedNumResults(); + } + + @Override + public void populateHeader(Header head, String rootNode) { + throw new NotImplementedException(); + } + + @Override + public String getType() { + throw new NotImplementedException(); + } + + @Override + public TripleID findTriple(long position) { + throw new NotImplementedException(); + } + + @Override + public void close() throws IOException { + // nothing to do + } + + private static class SimpleIteratorTripleID implements IteratorTripleID { + private final Iterator it; + private final TripleComponentOrder order; + private final long tripleCount; + + public SimpleIteratorTripleID(Iterator it, TripleComponentOrder order, long tripleCount) { + this.it = it; + this.order = order; + this.tripleCount = tripleCount; + } + + @Override + public boolean hasPrevious() { + throw new NotImplementedException(); + } + + @Override + public TripleID previous() { + throw new NotImplementedException(); + } + + @Override + public void goToStart() { + throw new NotImplementedException(); + } + + @Override + public boolean canGoTo() { + throw new NotImplementedException(); + } + + @Override + public void goTo(long pos) { + throw new NotImplementedException(); + } + + @Override + public long estimatedNumResults() { + return tripleCount; + } + + @Override + public ResultEstimationType numResultEstimation() { + return ResultEstimationType.UP_TO; + } + + @Override + public TripleComponentOrder getOrder() { + return order; + } + + @Override + public long getLastTriplePosition() { + return tripleCount; + } + + @Override + public boolean hasNext() { + return it.hasNext(); + } + + @Override + public TripleID next() { + return it.next(); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesList.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesList.java index 1adcdae1..015c7af4 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesList.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesList.java @@ -41,6 +41,7 @@ import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.triples.*; import org.rdfhdt.hdt.util.RDFInfo; import org.rdfhdt.hdt.util.io.CountInputStream; @@ -77,11 +78,12 @@ public TriplesList(HDTOptions specification) { this.arrayOfTriples = new ArrayList((int)numTriples); //choosing starting(or default) component order - String orderStr = specification.get("triplesOrder"); - if(orderStr==null) { - orderStr = "SPO"; + String orderStr = specification.get(HDTOptionsKeys.TRIPLE_ORDER_KEY); + if(orderStr == null) { + this.order = TripleComponentOrder.SPO; + } else { + this.order = TripleComponentOrder.valueOf(orderStr); } - this.order = TripleComponentOrder.valueOf(orderStr); this.numValidTriples = 0; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesListLong.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesListLong.java index 
3a06d9c8..84ef75f5 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesListLong.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesListLong.java @@ -46,6 +46,7 @@ import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.triples.IteratorTripleID; import org.rdfhdt.hdt.triples.TempTriples; import org.rdfhdt.hdt.triples.TripleID; @@ -86,11 +87,12 @@ public TriplesListLong(HDTOptions specification) { this.arrayOfTriples = new ArrayList((int)numTriples); //choosing starting(or default) component order - String orderStr = specification.get("triplesOrder"); - if(orderStr==null) { - orderStr = "SPO"; + String orderStr = specification.get(HDTOptionsKeys.TRIPLE_ORDER_KEY); + if(orderStr == null) { + this.order = TripleComponentOrder.SPO; + } else { + this.order = TripleComponentOrder.valueOf(orderStr); } - this.order = TripleComponentOrder.valueOf(orderStr); this.numValidTriples = 0; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/WriteBitmapTriples.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/WriteBitmapTriples.java new file mode 100644 index 00000000..13cde73f --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/WriteBitmapTriples.java @@ -0,0 +1,252 @@ +package org.rdfhdt.hdt.triples.impl; + +import org.rdfhdt.hdt.compact.bitmap.AppendableWriteBitmap; +import org.rdfhdt.hdt.compact.sequence.SequenceLog64BigDisk; +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.exceptions.IllegalFormatException; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.iterator.SuppliableIteratorTripleID; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; +import org.rdfhdt.hdt.triples.IteratorTripleID; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.TriplesPrivate; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.CountInputStream; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.listener.ListenerUtil; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; + +/** + * Appendable write {@link org.rdfhdt.hdt.triples.impl.BitmapTriples} version + * + * @author Antoine Willerval + */ +public class WriteBitmapTriples implements TriplesPrivate { + protected TripleComponentOrder order; + private long numTriples; + private final AppendableWriteBitmap bitY, bitZ; + private final CloseSuppressPath seqY, seqZ, triples; + private SequenceLog64BigDisk vectorY, vectorZ; + + public WriteBitmapTriples(HDTOptions spec, CloseSuppressPath triples, int bufferSize) throws IOException { + String orderStr = spec.get(HDTOptionsKeys.TRIPLE_ORDER_KEY); + if(orderStr == null) { + this.order = TripleComponentOrder.SPO; + } else { + this.order = TripleComponentOrder.valueOf(orderStr); + } + triples.mkdirs(); + triples.closeWithDeleteRecurse(); + this.triples = triples; + bitY = new AppendableWriteBitmap(triples.resolve("bitmapY"), 
bufferSize); + bitZ = new AppendableWriteBitmap(triples.resolve("bitmapZ"), bufferSize); + seqY = triples.resolve("seqY"); + seqZ = triples.resolve("seqZ"); + } + + @Override + public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { + ci.clear(); + ci.setFormat(getType()); + ci.setInt("order", order.ordinal()); + ci.setType(ControlInfo.Type.TRIPLES); + ci.save(output); + + IntermediateListener iListener = new IntermediateListener(listener); + bitY.save(output, iListener); + bitZ.save(output, iListener); + vectorY.save(output, iListener); + vectorZ.save(output, iListener); + } + + @Override + public IteratorTripleID searchAll() { + throw new NotImplementedException(); + } + + @Override + public SuppliableIteratorTripleID search(TripleID pattern) { + throw new NotImplementedException(); + } + + @Override + public long getNumberOfElements() { + return numTriples; + } + + @Override + public long size() { + return numTriples * 4; + } + + @Override + public void populateHeader(Header header, String rootNode) { + if (rootNode == null || rootNode.length() == 0) { + throw new IllegalArgumentException("Root node for the header cannot be null"); + } + + header.insert(rootNode, HDTVocabulary.TRIPLES_TYPE, getType()); + header.insert(rootNode, HDTVocabulary.TRIPLES_NUM_TRIPLES, getNumberOfElements()); + header.insert(rootNode, HDTVocabulary.TRIPLES_ORDER, order.toString()); +// header.insert(rootNode, HDTVocabulary.TRIPLES_SEQY_TYPE, seqY.getType() ); +// header.insert(rootNode, HDTVocabulary.TRIPLES_SEQZ_TYPE, seqZ.getType() ); +// header.insert(rootNode, HDTVocabulary.TRIPLES_SEQY_SIZE, seqY.size() ); +// header.insert(rootNode, HDTVocabulary.TRIPLES_SEQZ_SIZE, seqZ.size() ); +// if(bitmapY!=null) { +// header.insert(rootNode, HDTVocabulary.TRIPLES_BITMAPY_SIZE, bitmapY.getSizeBytes() ); +// } +// if(bitmapZ!=null) { +// header.insert(rootNode, HDTVocabulary.TRIPLES_BITMAPZ_SIZE, bitmapZ.getSizeBytes() ); +// } + } + + @Override + public String getType() { + return HDTVocabulary.TRIPLES_TYPE_BITMAP; + } + + @Override + public TripleID findTriple(long position) { + throw new NotImplementedException(); + } + + @Override + public void load(InputStream input, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void mapFromFile(CountInputStream in, File f, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void generateIndex(ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void loadIndex(InputStream input, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void mapIndex(CountInputStream input, File f, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void saveIndex(OutputStream output, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void load(TempTriples triples, ProgressListener listener) { + triples.setOrder(order); + triples.sort(listener); + + IteratorTripleID it = triples.searchAll(); + + long number = it.estimatedNumResults(); + + vectorY = new SequenceLog64BigDisk(seqY.toAbsolutePath().toString(), BitUtil.log2(number)); + vectorZ = new SequenceLog64BigDisk(seqZ.toAbsolutePath().toString(), BitUtil.log2(number)); + + long lastX = 0, lastY = 0, lastZ = 0; + long x, y, z; + numTriples = 0; + + while (it.hasNext()) { + TripleID 
triple = it.next(); + TripleOrderConvert.swapComponentOrder(triple, TripleComponentOrder.SPO, order); + + x = triple.getSubject(); + y = triple.getPredicate(); + z = triple.getObject(); + if (x == 0 || y == 0 || z == 0) { + throw new IllegalFormatException("None of the components of a triple can be null"); + } + + if (numTriples == 0) { + // First triple + vectorY.append(y); + vectorZ.append(z); + } else if (x != lastX) { + if (x != lastX + 1) { + throw new IllegalFormatException("Upper level must be increasing and correlative."); + } + // X changed + bitY.append(true); + vectorY.append(y); + + bitZ.append(true); + vectorZ.append(z); + } else if (y != lastY) { + if (y < lastY) { + throw new IllegalFormatException("Middle level must be increasing for each parent."); + } + + // Y changed + bitY.append(false); + vectorY.append(y); + + bitZ.append(true); + vectorZ.append(z); + } else { + if (z < lastZ) { + throw new IllegalFormatException("Lower level must be increasing for each parent."); + } + + // Z changed + bitZ.append(false); + vectorZ.append(z); + } + + lastX = x; + lastY = y; + lastZ = z; + + ListenerUtil.notifyCond(listener, "Converting to BitmapTriples", numTriples, numTriples, number); + numTriples++; + } + + if (numTriples > 0) { + bitY.append(true); + bitZ.append(true); + } + + vectorY.aggressiveTrimToSize(); + vectorZ.aggressiveTrimToSize(); + } + + @Override + public TripleComponentOrder getOrder() { + return order; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll( + bitY, + bitZ, + vectorY, + seqY, + vectorZ, + seqZ, + triples + ); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/BitUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/BitUtil.java index 8353d865..f7882be5 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/BitUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/BitUtil.java @@ -44,7 +44,7 @@ public static int log2(long n) { } public static long maxVal(int numbits) { - return ~(~0L< list type + * @author Antoine Willerval + */ +public class ParallelSortableArrayList implements List { + public static final double GROW_FACTOR = 1.5f; + private int used; + private T[] array; + private final Class type; + + public ParallelSortableArrayList(Class type) { + this(type, 16); + } + + @SuppressWarnings("unchecked") + public ParallelSortableArrayList(Class type, int capacity) { + this.type = type; + array = (T[]) Array.newInstance(type.getComponentType(), capacity); + } + + private void checkSize(int newSize) { + if (newSize >= array.length) { + // don't allocate beyond the max size + int allocate = (int) Math.min(Integer.MAX_VALUE - 5L, (long) (newSize * GROW_FACTOR)); + array = Arrays.copyOf(array, allocate, type); + } + } + + @Override + public boolean add(T element) { + checkSize(used + 1); + array[used++] = element; + return true; + } + + @Override + public boolean remove(Object o) { + throw new NotImplementedException(); + } + + @Override + public boolean containsAll(Collection c) { + throw new NotImplementedException(); + } + + @Override + public boolean addAll(Collection c) { + throw new NotImplementedException(); + } + + @Override + public boolean addAll(int index, Collection c) { + throw new NotImplementedException(); + } + + @Override + public boolean removeAll(Collection c) { + throw new NotImplementedException(); + } + + @Override + public boolean retainAll(Collection c) { + throw new NotImplementedException(); + } + + @Override + public int size() { + return used; + } + + @Override + 
public boolean isEmpty() { + return size() == 0; + } + + @Override + public boolean contains(Object o) { + throw new NotImplementedException(); + } + + @Override + public void clear() { + for (int i = 0; i < used; i++) { + array[i] = null; + } + used = 0; + } + + @Override + public T get(int index) { + return array[index]; + } + + @Override + public T set(int index, T element) { + return array[index] = element; + } + + @Override + public void add(int index, T element) { + throw new NotImplementedException(); + } + + @Override + public T remove(int index) { + throw new NotImplementedException(); + } + + @Override + public int indexOf(Object o) { + for (int i = 0; i < size(); i++) { + if (get(i).equals(o)) { + return i; + } + } + return -1; + } + + @Override + public int lastIndexOf(Object o) { + for (int i = size() - 1; i >= 0; i--) { + if (get(i).equals(o)) { + return i; + } + } + return -1; + } + + @Override + public ListIterator listIterator() { + throw new NotImplementedException(); + } + + @Override + public ListIterator listIterator(int index) { + throw new NotImplementedException(); + } + + @Override + public List subList(int fromIndex, int toIndex) { + throw new NotImplementedException(); + } + + public T[] getArray() { + return array; + } + + @Override + public Iterator iterator() { + return Arrays.asList(array).subList(0, used).iterator(); + } + + @Override + public Object[] toArray() { + return Arrays.copyOf(array, used, Object[].class); + } + + @Override + public T1[] toArray(T1[] a) { + throw new NotImplementedException(); + } + + @Override + public void sort(Comparator comparator) { + Arrays.sort(array, 0, used, comparator); + } + + /** + * sort this array in parallel (if available) + * @param comparator sort comparator + */ + public void parallelSort(Comparator comparator) { + Arrays.parallelSort(array, 0, used, comparator); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java new file mode 100644 index 00000000..686b21c7 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java @@ -0,0 +1,155 @@ +package org.rdfhdt.hdt.util; + +import java.util.ArrayList; +import java.util.List; + +/** + * tool to profile time + * @author Antoine Willerval + */ +public class Profiler { + private int maxSize = 0; + private final String name; + private Section mainSection; + private boolean disabled; + + /** + * create a profiler + * @param name the profiler name + */ + public Profiler(String name) { + this.name = name; + } + + /** + * disable the profiler methods + * @param disable if true, the methods will be callable, but won't do anything + */ + public void setDisabled(boolean disable) { + this.disabled = disable; + } + + /** + * start a section + * @param name the section name + */ + public void pushSection(String name) { + if (disabled) { + return; + } + getMainSection().pushSection(name, 0); + } + + /** + * complete a section + */ + public void popSection() { + if (disabled) { + return; + } + if (!getMainSection().isRunning()) { + throw new IllegalArgumentException("profiler not running!"); + } + getMainSection().popSection(); + } + + /** + * stop the profiler without poping sections + */ + public void stop() { + if (disabled) { + return; + } + getMainSection().stop(); + } + + /** + * write the profile into the console + */ + public void writeProfiling() { + if (disabled) { + return; + } + getMainSection().writeProfiling("", true); + } + + /** + * @return the main section 
of the profiler
+     */
+    public Section getMainSection() {
+        if (this.mainSection == null) {
+            this.mainSection = new Section(name);
+        }
+        return this.mainSection;
+    }
+
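+    // Usage sketch ("loading" and "indexing" are placeholder section names):
+    //   Profiler profiler = new Profiler("generation");
+    //   profiler.pushSection("loading");
+    //   profiler.pushSection("indexing"); // nested inside "loading"
+    //   profiler.popSection();            // ends "indexing"
+    //   profiler.popSection();            // ends "loading"
+    //   profiler.stop();
+    //   profiler.writeProfiling();        // prints the section tree with elapsed times
+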
+    /**
+     * a section in the profiling
+     */
+    public class Section {
+        private final String name;
+        private final long start = System.nanoTime();
+        private long end = start;
+        private final List<Section> subSections = new ArrayList<>();
+        private Section currentSection;
+
+        Section(String name) {
+            this.name = name;
+        }
+
+        /**
+         * @return the subsections
+         */
+        public List<Section> getSubSections() {
+            return subSections;
+        }
+
+        /**
+         * @return the section name
+         */
+        public String getName() {
+            return name;
+        }
+
+        boolean isRunning() {
+            return currentSection != null;
+        }
+
+        void pushSection(String name, int deep) {
+            if (isRunning()) {
+                currentSection.pushSection(name, deep + 1);
+                return;
+            }
+
+            subSections.add(currentSection = new Section(name));
+            maxSize = Math.max(name.length() + deep * 2, maxSize);
+        }
+
+        boolean popSection() {
+            if (isRunning()) {
+                if (currentSection.popSection()) {
+                    currentSection = null;
+                }
+                return false;
+            } else {
+                end = System.nanoTime();
+                return true;
+            }
+        }
+
+        void stop() {
+            if (isRunning()) {
+                currentSection.stop();
+            }
+            end = System.nanoTime();
+        }
+
+        void writeProfiling(String prefix, boolean isLast) {
+            System.out.println(prefix + (getSubSections().isEmpty() ? "+--" : "+-+") + " [" + getName() + "] " + "-".repeat(1 + maxSize - getName().length()) + " elapsed=" + (end - start) / 1_000_000L + "ms");
+            for (int i = 0; i < subSections.size(); i++) {
+                Section s = subSections.get(i);
+                s.writeProfiling(prefix + (isLast ? "  " : "| "), i == subSections.size() - 1);
+            }
+        }
+    }
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Reference.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Reference.java
new file mode 100644
index 00000000..91111882
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Reference.java
@@ -0,0 +1,60 @@
+package org.rdfhdt.hdt.util;
+
+import java.util.function.Supplier;
+
+/**
+ * Simple object reference
+ * @param <T> type of the object
+ */
+public class Reference<T> {
+    private T object;
+
+    /**
+     * create with an object
+     * @param object the object
+     */
+    public Reference(T object) {
+        this.object = object;
+    }
+
+    /**
+     * create with a null object
+     */
+    public Reference() {
+        this(null);
+    }
+
+    /**
+     * set the object
+     * @param object the object
+     */
+    public void setObject(T object) {
+        this.object = object;
+    }
+
+    /**
+     * @return the object
+     */
+    public T getObject() {
+        return object;
+    }
+
+    /**
+     * @return if the object is null
+     */
+    public boolean isNull() {
+        return object == null;
+    }
+
+    /**
+     * compute the object if it is null and return the object
+     * @param compute the compute function
+     * @return the object
+     */
+    public T computeIfAbsent(Supplier<T> compute) {
+        if (isNull()) {
+            setObject(compute.get());
+        }
+        return getObject();
+    }
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionFunction.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionFunction.java
new file mode 100644
index 00000000..3983b756
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionFunction.java
@@ -0,0 +1,6 @@
+package org.rdfhdt.hdt.util.concurrent;
+
+@FunctionalInterface
+public interface ExceptionFunction<I, O, E extends Exception> {
+    O apply(I value) throws E;
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionSupplier.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionSupplier.java
new file mode 100644
index 00000000..02ba145f
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionSupplier.java
@@ -0,0 +1,6 @@
+package org.rdfhdt.hdt.util.concurrent;
+
+@FunctionalInterface
+public interface ExceptionSupplier<T, E extends Exception> {
+    T get() throws E;
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/HeightTree.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/HeightTree.java
new file mode 100644
index 00000000..8526e526 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/HeightTree.java @@ -0,0 +1,97 @@ +package org.rdfhdt.hdt.util.concurrent; + +import java.util.ArrayList; +import java.util.List; + +/** + * data structure to store chunk with a height, this class isn't thread safe + * + * @param tree type + * @author Antoine Willerval + */ +public class HeightTree { + private final List> elements = new ArrayList<>(); + private int size = 0; + + /** + * get at least minNumber from the same height + * + * @param minNumber minNumber the minimum element count to get + * @return list containing the elements or null if at least minNumber elements can't be get + */ + public List getMax(int minNumber) { + for (int i = 0; i < elements.size(); i++) { + List list = elements.get(i); + if (list.size() == minNumber) { + // get all the elements + elements.set(i, new ArrayList<>()); + size -= list.size(); + return list; + } else if (list.size() > minNumber) { + // remove the last maxNumber elements (no copy) + List list1 = new ArrayList<>(); + int count = list.size() - 1; + for (int j = 0; j < minNumber; j++) { + list1.add(list.get(count - j)); + list.remove(count - j); + } + size -= list1.size(); + return list1; + } + } + return null; + } + + /** + * get at most maxNumber from the bottom height + * + * @param maxNumber the maximum element to get + * @return list containing the elements + */ + public List getAll(int maxNumber) { + List list = new ArrayList<>(); + + int count = maxNumber; + + for (List l : elements) { + int toGet = Math.min(l.size(), count); + int n = l.size() - 1; + for (int j = 0; j < toGet; j++) { + list.add(l.get(n - j)); + l.remove(n - j); + --count; + } + if (count == 0) { + break; + } + } + + size -= list.size(); + return list; + } + + /** + * add an element to the tree + * + * @param element element + * @param height height + */ + public void addElement(E element, int height) { + if (height >= elements.size()) { + // ensure size + for (int i = elements.size(); i <= height; i++) { + elements.add(new ArrayList<>()); + } + } + + size++; + elements.get(height).add(element); + } + + /** + * @return the number of elements in the tree + */ + public int size() { + return size; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/KWayMerger.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/KWayMerger.java new file mode 100644 index 00000000..3a0bda82 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/KWayMerger.java @@ -0,0 +1,302 @@ +package org.rdfhdt.hdt.util.concurrent; + +import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; + +import java.io.IOException; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * Object to perform k-Way-Merge using an {@link AsyncIteratorFetcher} + * + * @param merge object type + * @author Antoine Willerval + */ +public class KWayMerger> { + private static final AtomicInteger ID = new AtomicInteger(); + private final int k; + private final AsyncIteratorFetcher iteratorFetcher; + private final KWayMergerImpl impl; + private final Worker[] workers; + private final AtomicLong pathId = new 
AtomicLong(); + private final CloseSuppressPath workLocation; + private final Lock dataLock = new ReentrantLock(); + private boolean started; + private boolean end; + private final HeightTree chunks = new HeightTree<>(); + private Throwable throwable; + + /** + * kwaymerger + * + * @param workLocation location to store the chunks + * @param syncSupplier the element supplier + * @param impl implementation of {@link KWayMergerImpl} to create/handle the chunks + * @param workers the number of workers + * @param k the k in the k-way merge + */ + public KWayMerger(CloseSuppressPath workLocation, AsyncIteratorFetcher syncSupplier, KWayMergerImpl impl, int workers, int k) throws KWayMergerException { + this.workLocation = workLocation; + this.iteratorFetcher = syncSupplier; + this.impl = impl; + this.k = k; + + try { + workLocation.mkdirs(); + } catch (IOException e) { + throw new KWayMergerException("Can't create workLocation directory!", e); + } + + this.workers = new Worker[workers]; + int id = ID.incrementAndGet(); + for (int i = 0; i < workers; i++) { + this.workers[i] = new Worker("KWayMerger#" + id + "Worker#" + i, this); + } + } + + /** + * start all the workers + */ + public void start() { + if (started) { + throw new IllegalArgumentException("The KWayMerger was already started and can't be reused!"); + } + started = true; + for (Worker w : workers) { + w.start(); + } + } + + private void exception(Throwable t) { + if (throwable != null) { + throwable.addSuppressed(t); + } else { + throwable = t; + } + for (Worker w : workers) { + w.interrupt(); + } + } + + /** + * wait the result and return it (if any), this method isn't thread safe and can't be called twice + * + * @return optional of the result + * @throws InterruptedException wait interupption + * @throws KWayMergerException exception while merging + */ + public Optional waitResult() throws InterruptedException, KWayMergerException { + if (!started) { + throw new IllegalArgumentException("The KWayMerger hasn't been started!"); + } + for (Worker w : workers) { + w.join(); + } + if (throwable != null) { + if (throwable instanceof Error) { + throw (Error) throwable; + } + if (throwable instanceof RuntimeException) { + throw (RuntimeException) throwable; + } + if (throwable instanceof KWayMergerException) { + throw (KWayMergerException) throwable; + } + throw new KWayMergerException(throwable); + } + + if (chunks.size() > 1) { + throw new KWayMergerException("Chunk size is above 1! " + chunks.size()); + } + + List all = chunks.getAll(1); + return all.isEmpty() ? 
Optional.empty() : Optional.of(all.get(0).getPath()); + } + + @FunctionalInterface + private interface KWayMergerRunnable { + void run() throws KWayMergerException; + } + + /** + * implementation to handle the chunks + * + * @param chunk types + */ + public interface KWayMergerImpl> { + /** + * create a chunk from a flux, the flux is the returned by {@link #newStopFlux(Supplier)} + * + * @param flux flux to handle + * @param output output to write the chunk + * @throws KWayMergerException any exception returned by this method's implementation + */ + void createChunk(S flux, CloseSuppressPath output) throws KWayMergerException; + + /** + * merge chunks together into a new chunk + * + * @param inputs the chunks + * @param output the output chunk + * @throws KWayMergerException any exception returned by this method's implementation + */ + void mergeChunks(List inputs, CloseSuppressPath output) throws KWayMergerException; + + /** + * create a flux from another one to tell when to stop + * + * @param flux the flux + * @return the new flux + * @throws KWayMergerException any exception returned by this method's implementation + */ + S newStopFlux(Supplier flux) throws KWayMergerException; + } + + /** + * @return a unique path into the work location + */ + private CloseSuppressPath getPath() { + return workLocation.resolve("f-" + pathId.incrementAndGet()); + } + + /** + * @return thread safe method to get a method to handle or null if no other tasks are required + */ + private KWayMergerRunnable getTask() { + dataLock.lock(); + try { + if (end) { + if (chunks.size() <= 1) { + return null; + } + + List all = chunks.getAll(k); + + return new MergeTask(all); + } + + List chunkList = chunks.getMax(k); + + if (chunkList != null) { + return new MergeTask(chunkList); + } + + return new GetTask(); + } finally { + dataLock.unlock(); + } + } + + private class MergeTask implements KWayMergerRunnable { + private final List chunks; + + public MergeTask(List chunks) { + assert !chunks.isEmpty() : "empty chunks"; + this.chunks = chunks; + } + + @Override + public void run() throws KWayMergerException { + int chunk = chunks.stream().mapToInt(Chunk::getHeight).max().orElseThrow() + 1; + CloseSuppressPath mergec = getPath(); + List paths = chunks.stream().map(Chunk::getPath).collect(Collectors.toUnmodifiableList()); + impl.mergeChunks(paths, mergec); + try { + IOUtil.closeAll(paths); + } catch (IOException e) { + throw new KWayMergerException("Can't close end merge files", e); + } + dataLock.lock(); + try { + KWayMerger.this.chunks.addElement(new Chunk(chunk, mergec), chunk); + } finally { + dataLock.unlock(); + } + } + } + + private class GetTask implements KWayMergerRunnable { + + @Override + public void run() throws KWayMergerException { + CloseSuppressPath chunk = getPath(); + S flux = impl.newStopFlux(iteratorFetcher); + impl.createChunk(flux, chunk); + dataLock.lock(); + try { + end = iteratorFetcher.isEnd(); + Chunk newChunk = new Chunk(1, chunk); + chunks.addElement(newChunk, newChunk.getHeight()); + } finally { + dataLock.unlock(); + } + } + } + + private static class Chunk { + private final int height; + private final CloseSuppressPath path; + + public Chunk(int height, CloseSuppressPath path) { + this.height = height; + this.path = path; + } + + public int getHeight() { + return height; + } + + public CloseSuppressPath getPath() { + return path; + } + } + + private static class Worker extends Thread { + private final KWayMerger parent; + + public Worker(String name, KWayMerger parent) { + super(name); + 
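+            // keep the merger to poll tasks from in run()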
this.parent = parent; + } + + @Override + public void run() { + try { + KWayMergerRunnable task; + + while (!isInterrupted() && (task = parent.getTask()) != null) { + task.run(); + } + } catch (Throwable t) { + parent.exception(t); + } + } + } + + /** + * Exception linked with the {@link KWayMerger}, this class isn't a {@link RuntimeException} because these exceptions should be seriously considered + * + * @author Antoine Willerval + */ + public static class KWayMergerException extends Exception { + public KWayMergerException(String message) { + super(message); + } + + public KWayMergerException(String message, Throwable cause) { + super(message, cause); + } + + public KWayMergerException(Throwable cause) { + super(cause); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncListener.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncListener.java new file mode 100644 index 00000000..1fa90b2c --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncListener.java @@ -0,0 +1,28 @@ +package org.rdfhdt.hdt.util.concurrent; + +import org.rdfhdt.hdt.listener.ProgressListener; + +/** + * {@link org.rdfhdt.hdt.listener.ProgressListener} wrapper to allow multiple thread to notify a progression + * @author Antoine Willerval + */ +public class SyncListener implements ProgressListener { + /** + * create a sync listener from another progress listener + * @param listener listener to sync, if it is null, this method returns null + * @return sync version of listener, or null if listener is null + */ + public static ProgressListener of(ProgressListener listener) { + return listener instanceof SyncListener || listener == null ? listener : new SyncListener(listener); + } + private final ProgressListener wrapper; + + private SyncListener(ProgressListener wrapper) { + this.wrapper = wrapper; + } + + @Override + public synchronized void notifyProgress(float level, String message) { + wrapper.notifyProgress(level, message); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/TreeWorker.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/TreeWorker.java new file mode 100644 index 00000000..dcd8232c --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/TreeWorker.java @@ -0,0 +1,699 @@ +package org.rdfhdt.hdt.util.concurrent; + + +import org.rdfhdt.hdt.listener.MultiThreadListener; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.IntFunction; + +/** + * a worker to parse tree operation + * @param the type used in the tree to supply + * @param the type used in the tree + * @author Antoine Willerval + */ +public class TreeWorker { + /** + * ID fetcher for the workers + */ + private static final AtomicInteger JOB_ID_NAME = new AtomicInteger(); + + /** + * Sync object for the FETCH operation + */ + private final Object FETCH_SYNC = new Object() { + }; + /** + * Sync object for waiting for new job + */ + private final Object WAITING_SYNC = new Object() { + }; + /** + * Sync object to show the current count of workers in the ProgressListener + */ + private final Object WORKING_SYNC = new Object() { + }; + /** + * Cat function (T[]) -> T + */ + private final TreeWorkerCat catFunction; + /** + * Supplier Function () -> S + */ + private final TreeWorkerSupplier baseLevelSupplier; + /** + * Map Function (S) -> T + */ + private final 
TreeWorkerMap<S, T> mapFunction; + /** + * Delete Function (T) -> void + */ + private final TreeWorkerDelete<T> delete; + /** + * Function to create array of type T + */ + private final IntFunction<T[]> arrayBuilder; + /** + * the current maximum level of the elements + */ + private int maxLevel = 0; + /** + * the count of workers waiting for a job + */ + private int workerWaiting = 0; + /** + * the count of working workers + */ + private int workerWorking; + /** + * the minimum number of elements to merge when the supplying phase isn't completed + */ + private final int treeCount; + /** + * the mapped elements waiting for a merge (T[]) + */ + private final List<Element> elements = new ArrayList<>(); + /** + * the supplied elements waiting for a map (S[]) + */ + private final List<S> suppliedElements = new ArrayList<>(); + /** + * the worker threads + */ + private final List<Worker> workers; + /** + * if the TreeWorker is started + */ + private boolean started = false; + /** + * if the fetch phase is completed + */ + private boolean fetchDone = false; + /** + * if the map phase is completed + */ + private boolean mapDone = false; + /** + * any throwable returned by the TreeWorker + */ + private TreeWorkerException throwable; + /** + * the progress listener + */ + private MultiThreadListener listener; + + /** + * create a tree worker + * @param catFunction the function to cat 2 nodes + * @param baseLevelSupplier the supplier to get base nodes + * @param delete the delete method to delete data in case of error, can be null if no delete is required + * @param mapFunction the map function + * @param arrayBuilder method to create an array of type T + * @throws TreeWorkerException if the tree worker can't be created + * @throws java.lang.NullPointerException if catFunction or baseLevelSupplier is null + */ + public TreeWorker(TreeWorkerCat<T> catFunction, TreeWorkerSupplier<S> baseLevelSupplier, TreeWorkerDelete<T> delete, TreeWorkerMap<S, T> mapFunction, IntFunction<T[]> arrayBuilder) throws TreeWorkerException { + this(catFunction, baseLevelSupplier, delete, mapFunction, arrayBuilder, Runtime.getRuntime().availableProcessors(), 1); + } + + /** + * create a tree worker + * @param workerObject the worker object + * @param arrayBuilder method to create an array of type T + * @param workers the number of workers to use + * @param nodePerMerge number of simultaneous merge tree (at least 1) + * @throws TreeWorkerException if the tree worker can't be created + * @throws java.lang.NullPointerException if catFunction or baseLevelSupplier is null + */ + public <E extends TreeWorkerCat<T> & TreeWorkerSupplier<S> & TreeWorkerDelete<T> & TreeWorkerMap<S, T>> TreeWorker(E workerObject, IntFunction<T[]> arrayBuilder, int workers, int nodePerMerge) throws TreeWorkerException { + this(workerObject, workerObject, workerObject, workerObject, arrayBuilder, workers, nodePerMerge); + } + /** + * create a tree worker + * @param catFunction the function to cat 2 nodes + * @param baseLevelSupplier the supplier to get base nodes + * @param delete the delete method to delete data in case of error, can be null if no delete is required + * @param mapFunction the map function + * @param arrayBuilder method to create an array of type T + * @param workers the number of workers to use + * @param nodePerMerge number of simultaneous merge tree (at least 1) + * @throws TreeWorkerException if the tree worker can't be created + * @throws java.lang.NullPointerException if catFunction or baseLevelSupplier is null + */ + public TreeWorker(TreeWorkerCat<T> catFunction, TreeWorkerSupplier<S> baseLevelSupplier, TreeWorkerDelete<T> delete, TreeWorkerMap<S, T> mapFunction, IntFunction<T[]>
arrayBuilder, int workers, int nodePerMerge) throws TreeWorkerException { + this.catFunction = Objects.requireNonNull(catFunction, "catFunction can't be null!"); + this.mapFunction = Objects.requireNonNull(mapFunction, "mapFunction can't be null!"); + this.baseLevelSupplier = Objects.requireNonNull(baseLevelSupplier, "baseLevelSupplier can't be null!"); + this.arrayBuilder = Objects.requireNonNull(arrayBuilder, "arrayBuilder can't be null!"); + if (delete == null) { + this.delete = (t) -> {}; + } else { + this.delete = delete; + } + if (nodePerMerge <= 0) { + throw new TreeWorkerException("nodePerMerge count can't be <= 0!"); + } + treeCount = 1 << nodePerMerge; + if (workers <= 0) { + throw new TreeWorkerException("worker count can't be <= 0!"); + } + S s = baseLevelSupplier.get(); + if (s == null) { + throw new TreeWorkerException("no base element!"); + } + suppliedElements.add(s); + this.workers = new ArrayList<>(workers); + for (int i = 0; i < workers; i++) { + this.workers.add(new Worker()); + } + workerWorking = workers; + } + + /** + * create a generic array T[] of a given size + * @param size the size + * @return the array + */ + private T[] createArray(int size) { + T[] array = arrayBuilder.apply(size); + assert array != null && array.length >= size : "array function should create an array with a size of at least size"; + return array; + } + + /** + * set a listener for each worker + * @param listener the listener + */ + public void setListener(MultiThreadListener listener) { + this.listener = listener; + } + + /** + * Start the workers + */ + public void start() { + synchronized (elements) { + if (started) { + throw new IllegalArgumentException("TreeWorker already started!"); + } + for (Worker worker : this.workers) { + worker.start(); + } + started = true; + } + } + + /** + * delete all the elements + */ + private void clearData() { + for (Element e: elements) { + delete.delete(e.mappedValue); + } + } + + /** + * wait for the tree worker to complete + * @return the last element + * @throws TreeWorkerException if an error occurred in a worker + * @throws InterruptedException in case of interruption + */ + public T waitToComplete() throws TreeWorkerException, InterruptedException { + try { + if (listener != null) { + synchronized (WORKING_SYNC) { + while (workerWorking > 0) { + listener.notifyProgress(100F * (workers.size() - workerWorking) / workers.size(), "waiting for workers to complete " + (workers.size() - workerWorking) + "/" + workers.size()); + WORKING_SYNC.wait(); + } + } + } + for (Worker w: workers) { + w.join(); + } + + if (listener != null) { + listener.notifyProgress(100, "tree completed"); + } + } catch (InterruptedException e) { + clearData(); + throw e; + } + + if (throwable != null) { + clearData(); + throw throwable; + } + + if (!fetchDone || !mapDone) { + clearData(); + // shouldn't be possible?
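// Usage sketch (illustrative, the helper methods are hypothetical): build the
// worker, start it, then wait for the merged root element.
//
// TreeWorker<Path, HDT> worker = new TreeWorker<>(
//         (arr, n) -> catHDTs(arr, n), // cat: merge n elements into one (hypothetical)
//         () -> nextChunk(),           // supplier: next base element, null when exhausted (hypothetical)
//         hdt -> free(hdt),            // delete: release an element on error (hypothetical)
//         path -> loadHDT(path),       // map: Path -> HDT (hypothetical)
//         HDT[]::new,                  // array builder for T[]
//         4, 1);                       // 4 workers, 2^1 = 2 nodes per merge
// worker.start();
// HDT root = worker.waitToComplete();  // null if the supplier produced nothing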
+ throw new TreeWorkerException("The worker isn't done!"); + } + if (elements.isEmpty()) { + return null; + } + return elements.get(0).mappedValue; + } + + private int countBase() { + return suppliedElements.size(); + } + + /** + * map function to map an element to another + * @param <T> old type + * @param <E> new type + * @author Antoine Willerval + */ + public interface TreeWorkerMap<T, E> { + /** + * create an identity map function + * @param <T> the type + * @return map function + */ + static <T> TreeWorkerMap<T, T> identity() { + return t -> t; + } + /** + * map the value + * @param prev the previous value + * @return the new value + */ + E map(T prev); + } + + /** + * cat function to merge two elements + * @param <T> the elements type + * @author Antoine Willerval + */ + @FunctionalInterface + public interface TreeWorkerCat<T> { + /** + * construct an element from elements + * @param element the array of elements. + * @param count the number of elements in the array, from index 0 (inclusive) to count (exclusive) + * @return the cat of the elements + */ + T construct(T[] element, int count); + } + /** + * delete function in case of error + * @param <T> the elements type + * @author Antoine Willerval + */ + @FunctionalInterface + public interface TreeWorkerDelete<T> { + /** + * delete an unused element + * @param e the element to delete + */ + void delete(T e); + } + /** + * supply function + * @param <S> the elements type + * @author Antoine Willerval + */ + @FunctionalInterface + public interface TreeWorkerSupplier<S> { + /** + * supply an element to merge + * @return the element to merge + */ + S get(); + } + + /** + * Interface containing all the TreeWorker functions to implement + * @param <S> Supplying type + * @param <T> Mapped type + * @author Antoine Willerval + */ + public interface TreeWorkerObject<S, T> extends TreeWorkerCat<T>, TreeWorkerSupplier<S>, TreeWorkerDelete<T>, TreeWorkerMap<S, T> { + } + /** + * Interface containing all the TreeWorker functions to implement without the map operation + * @param <T> type + * @author Antoine Willerval + */ + public interface TreeWorkerObjectNoMap<T> extends TreeWorkerObject<T, T> { + @Override + default T map(T prev) { + return prev; + } + } + + /** + * @return if the worker is completed + */ + public boolean isCompleted() { + synchronized (elements) { + return (fetchDone && mapDone && elements.size() <= 1) || throwable != null; + } + } + + private class Element { + T mappedValue; + int level; + + public Element(T mappedValue, int level) { + this.mappedValue = mappedValue; + this.level = level; + } + } + + private class Tuple { + Element first; + T[] elements; + int count; + int level; + Tuple() { + elements = createArray(treeCount); + clear(); + } + + /** + * add an element to this tuple + * @param e the element + */ + public void addElement(Element e) { + if (count == 0) { + first = e; + level = e.level; + } + elements[count++] = e.mappedValue; + assert level == e.level : "add from different level"; + } + + /** + * @return the first element added since the last tuple reset/creation + */ + public Element getFirstElement() { + return first; + } + + /** + * remove all the elements from the tree worker elements + * @throws TreeWorkerException if an element can't be removed + */ + public void remove() throws TreeWorkerException { + for (int i = 0; i < count; i++) { + removeFirst(elements[i]); + } + } + + private void removeFirst(T element) throws TreeWorkerException { + Iterator<Element> it = TreeWorker.this.elements.iterator(); + while (it.hasNext()) { + Element e = it.next(); + if (e.mappedValue == element && e.level == level) {
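// note: identity comparison (==) on purpose here, the tuple removes the exact
// element instances it previously collected at this level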
+ it.remove(); + return; + } + } + throw new TreeWorkerException("Can't remove an element! " + element); + } + + /** + * @return the internal array, its length is at least the size returned by {@link #size()} + */ + public T[] getArray() { + return elements; + } + + /** + * @return the count of elements + */ + public int size() { + return count; + } + + /** + * reset the tuple + */ + public void clear() { + this.count = 0; + } + + /** + * get an element at a particular index + * @param index the index + * @return the element + */ + public T get(int index) { + return elements[index]; + } + + private int searchDir(int start, int direction, int min) { + if (direction < 0) { + for (int i = start; i >= 0; i--) { + searchAtLevel(i); + if (size() >= min) { + return i; + } + } + } else { + for (int i = start; i <= maxLevel; i++) { + searchAtLevel(i); + if (size() >= min) { + return i; + } + } + } + return -1; + } + + private void searchAtLevel(int level) { + clear(); + synchronized (TreeWorker.this.elements) { + for (Element e: TreeWorker.this.elements) { + if (e.level == level) { + addElement(e); + if (count == treeCount) { + return; + } + } + } + } + } + } + + private abstract static class TreeWorkerJob { + abstract void runJob(); + void clear() { + } + } + private class Fetch extends TreeWorkerJob { + @Override + public void runJob() { + synchronized (FETCH_SYNC) { + if (fetchDone) { + return; // another fetch job won + } + S s = baseLevelSupplier.get(); + synchronized (elements) { + if (s == null) { + fetchDone = true; + // say if all the mapping is done, only after the fetch was done + if (suppliedElements.isEmpty()) { + mapDone = true; + } + } else { + suppliedElements.add(s); + } + elements.notifyAll(); + } + } + } + } + + private class Map extends TreeWorkerJob { + S old; + + public Map(S old) { + this.old = old; + } + + @Override + public void runJob() { + // map the supplied value + T mappedValue = mapFunction.map(old); + + synchronized (TreeWorker.this.elements) { + // add it to the element list + TreeWorker.this.elements.add(new Element(mappedValue, 0)); + + // say if all the mapping is done, only after the fetch was done + if (fetchDone && suppliedElements.isEmpty()) { + mapDone = true; + } + elements.notifyAll(); + } + } + } + + private class Merge extends TreeWorkerJob { + T[] elements; + int count; + int level; + + public Merge(T[] elements, int count, int level) { + this.elements = elements; + this.count = count; + this.level = level; + assert count > 0: "cat from empty element!"; + } + + @Override + public void runJob() { + T t = catFunction.construct(elements, count); + synchronized (TreeWorker.this.elements) { + TreeWorker.this.elements.add(new Element(t, level + 1)); + maxLevel = Math.max(maxLevel, level + 1); + } + } + @Override + void clear() { + for (int i = 0; i < count; i++) { + delete.delete(elements[i]); + } + } + } + + private class Worker extends Thread { + // array used to get merge object + private final Tuple tuple = new Tuple(); + public Worker() { + super("JobWorker#" + JOB_ID_NAME.incrementAndGet()); + } + + @Override + public void run() { + try { + while (!isCompleted()) { + if (listener != null) { + listener.notifyProgress(0, "waiting job"); + } + TreeWorkerJob job = null; + try { + synchronized (WAITING_SYNC) { + job = getJob(); + if (job == null) { + if (isCompleted()) { + return; + } + workerWaiting++; + WAITING_SYNC.wait(); + --workerWaiting; + continue; + } + } + job.runJob(); + synchronized (WAITING_SYNC) { + if (workerWaiting > 0) { + WAITING_SYNC.notify(); +
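// Scheduling sketch (illustrative): with treeCount = 2 the elements form a binary
// merge tree. Fetch supplies base elements, Map lifts them to level 0, and Merge
// combines same-level elements into level + 1 until a single element remains:
//
//   level 2:              cat(e, f)
//   level 1:       e = cat(a, b)   f = cat(c, d)
//   level 0:       a      b        c      d       <- mapped elements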
} + } + } catch (Throwable t) { + if (job != null) { + job.clear(); + } + synchronized (elements) { + if (throwable != null) { + throwable.addSuppressed(t); + } + if (t instanceof TreeWorkerException) { + throwable = (TreeWorkerException) t; + } else { + throwable = new TreeWorkerException(t); + } + elements.notifyAll(); + } + synchronized (WAITING_SYNC) { + WAITING_SYNC.notifyAll(); + } + } + } + } finally { + if (listener != null) { + listener.notifyProgress(100, "completed"); + listener.unregisterThread(getName()); + } + synchronized (WORKING_SYNC) { + workerWorking--; + WORKING_SYNC.notify(); + } + } + } + + private TreeWorkerJob getJob() throws TreeWorkerException { + synchronized (elements) { + while (true) { + if (mapDone) { + if (elements.size() == 1) { + return null; // end, no ascend/merge required + } + int level = tuple.searchDir(0, 1, 1); + if (level == -1) { + return null; // size == 0 end + } + if (tuple.size() == 1) { + tuple.getFirstElement().level++; + } else { //size == 2 + tuple.remove(); + return new Merge(tuple.getArray(), tuple.size(), level); + } + } else { + if (fetchDone) { + if (suppliedElements.isEmpty()) { + // edge case if we are waiting for a map to complete, Fetch won't do anything + return new Fetch(); + } + return new Map(suppliedElements.remove(0)); + } + // count the number of supplied elements to know if we need to fetch another one + int level0 = countBase(); + if (workers.size() != 1 && level0 < workers.size() / 2) { + return new Fetch(); + } + // search for a merge candidate with the size treeCount + int level = tuple.searchDir(maxLevel, -1, treeCount); + + if (level != -1) { + // remove the component of the candidate and merge them + tuple.remove(); + return new Merge(tuple.getArray(), tuple.size(), level); + } + + if (suppliedElements.isEmpty()) { + // no supplied element to map, we fetch a new one + return new Fetch(); + } else { + // map the supplied element + return new Map(suppliedElements.remove(0)); + } + } + } + } + } + } + + /** + * An exception in the tree worker + * @author Antoine Willerval + */ + public static class TreeWorkerException extends Exception { + public TreeWorkerException(Throwable cause) { + super(cause); + } + + public TreeWorkerException(String message) { + super(message); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArray.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArray.java new file mode 100644 index 00000000..6874d74e --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArray.java @@ -0,0 +1,24 @@ +package org.rdfhdt.hdt.util.disk; + +/** + * Describe a large array of longs + */ +public interface LongArray { + /** + * get an element at a particular index + * @param index the index + * @return the value + */ + long get(long index); + /** + * Set a new value at the specified position. + * @param index the index + * @param value the value + */ + void set(long index, long value); + + /** + * @return the length of the array + */ + long length(); +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArrayDisk.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArrayDisk.java index d49fc7bb..1651f465 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArrayDisk.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArrayDisk.java @@ -33,7 +33,7 @@ //Implementing an array of longs that is backed up on disk. 
Following this: http://vanillajava.blogspot.fr/2011/12/using-memory-mapped-file-for-huge.html -public class LongArrayDisk implements Closeable { +public class LongArrayDisk implements Closeable, LongArray { private static final long MAPPING_SIZE = 1 << 30; private FileChannel channel; private CloseMappedByteBuffer[] mappings; @@ -118,6 +118,7 @@ public void close() throws IOException { channel = null; } + @Override public long get(long x) { long p = x * 8; int block = (int) (p / MAPPING_SIZE); @@ -129,6 +130,7 @@ public long getLong(long x) { return this.get(x); } + @Override public void set(long x, long y) { long p = x * 8; int block = (int) (p / MAPPING_SIZE); @@ -136,6 +138,7 @@ public void set(long x, long y) { mappings[block].putLong(offset, y); } + @Override public long length() { return size; } @@ -188,4 +191,5 @@ public long getSizeBits() { return size * 8L; } -} \ No newline at end of file +} + diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseMappedByteBuffer.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseMappedByteBuffer.java index 2603aaa4..216bf7ef 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseMappedByteBuffer.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseMappedByteBuffer.java @@ -19,17 +19,15 @@ static void markMapTest() { static void crashMapTest() { mapTest = false; - MAP_TEST_MAP.entrySet().stream() - .sorted(Comparator.comparingLong(Map.Entry::getKey)) - .map(Map.Entry::getValue) - .forEach(t -> { - System.out.println("-------------------"); - t.printStackTrace(); - System.out.println("-------------------"); - } - ); if (!MAP_TEST_MAP.isEmpty()) { - throw new RuntimeException("MAP NOT CLOSE: " + MAP_TEST_MAP.size()); + AssertionError re = new AssertionError(MAP_TEST_MAP.size() + " MAP(S) NOT CLOSED!"); + + MAP_TEST_MAP.entrySet().stream() + .sorted(Comparator.comparingLong(Map.Entry::getKey)) + .map(Map.Entry::getValue) + .forEach(re::addSuppressed); + + throw re; } } @@ -41,14 +39,18 @@ static void crashMapTest() { this.duplicated = duplicated; this.buffer = buffer; if (mapTest && !duplicated) { - MAP_TEST_MAP.put(id, new Throwable("MAP " + filename + "#" + id + "|"+ buffer)); + synchronized (MAP_TEST_MAP) { + MAP_TEST_MAP.put(id, new Throwable("MAP " + filename + "#" + id + "|" + buffer)); + } } } @Override + public void close() { + if (mapTest && !duplicated) { - MAP_TEST_MAP.remove(id); + synchronized (MAP_TEST_MAP) { + MAP_TEST_MAP.remove(id); + } } IOUtil.cleanBuffer(buffer); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java new file mode 100644 index 00000000..10c0095a --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java @@ -0,0 +1,247 @@ +package org.rdfhdt.hdt.util.io; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URI; +import java.nio.file.FileSystem; +import java.nio.file.Files; +import java.nio.file.LinkOption; +import java.nio.file.Path; +import java.nio.file.WatchEvent; +import java.nio.file.WatchKey; +import java.nio.file.WatchService; +import java.util.Iterator; +import java.util.Spliterator; +import java.util.function.Consumer; + +/** + * a path that deletes itself when it is closed + */ +public class CloseSuppressPath implements Path, Closeable { +
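// Usage sketch (illustrative): temporary files are wrapped in a CloseSuppressPath
// so that closing them also deletes them, e.g.:
//
// try (CloseSuppressPath tmp = CloseSuppressPath.of("work", "chunk-1")) {
//     try (OutputStream out = tmp.openOutputStream(CloseSuppressPath.BUFFER_SIZE)) {
//         out.write(bytes); // write temporary data ("bytes" is hypothetical)
//     }
//     // ... read the file back, merge it, etc.
// } // the file is deleted here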
public static final int BUFFER_SIZE = 1 << 13; + private final Path wrapper; + private boolean isDir; + + CloseSuppressPath(Path wrapper) { + this.wrapper = wrapper; + } + + public static CloseSuppressPath of(String first, String... more) { + return new CloseSuppressPath(Path.of(first, more)); + } + + public static CloseSuppressPath of(Path component) { + return component instanceof CloseSuppressPath ? (CloseSuppressPath) component : new CloseSuppressPath(component); + } + + @Override + public FileSystem getFileSystem() { + return wrapper.getFileSystem(); + } + + @Override + public boolean isAbsolute() { + return wrapper.isAbsolute(); + } + + @Override + public Path getRoot() { + return wrapper.getRoot(); + } + + @Override + public Path getFileName() { + return wrapper.getFileName(); + } + + @Override + public Path getParent() { + return wrapper.getParent(); + } + + @Override + public int getNameCount() { + return wrapper.getNameCount(); + } + + @Override + public Path getName(int index) { + return wrapper.getName(index); + } + + @Override + public Path subpath(int beginIndex, int endIndex) { + return wrapper.subpath(beginIndex, endIndex); + } + + @Override + public boolean startsWith(Path other) { + return wrapper.startsWith(other); + } + + @Override + public boolean startsWith(String other) { + return wrapper.startsWith(other); + } + + @Override + public boolean endsWith(Path other) { + return wrapper.endsWith(other); + } + + @Override + public boolean endsWith(String other) { + return wrapper.endsWith(other); + } + + @Override + public Path normalize() { + return wrapper.normalize(); + } + + @Override + public CloseSuppressPath resolve(Path other) { + return of(wrapper.resolve(other)); + } + + @Override + public CloseSuppressPath resolve(String other) { + return of(wrapper.resolve(other)); + } + + @Override + public CloseSuppressPath resolveSibling(Path other) { + return of(wrapper.resolveSibling(other)); + } + + @Override + public CloseSuppressPath resolveSibling(String other) { + return of(wrapper.resolveSibling(other)); + } + + @Override + public CloseSuppressPath relativize(Path other) { + return of(wrapper.relativize(other)); + } + + @Override + public URI toUri() { + return wrapper.toUri(); + } + + @Override + public Path toAbsolutePath() { + return wrapper.toAbsolutePath(); + } + + @Override + public Path toRealPath(LinkOption... options) throws IOException { + return wrapper.toRealPath(options); + } + + @Override + public File toFile() { + return wrapper.toFile(); + } + + @Override + public WatchKey register(WatchService watcher, WatchEvent.Kind<?>[] events, WatchEvent.Modifier... modifiers) throws IOException { + return wrapper.register(watcher, events, modifiers); + } + + @Override + public WatchKey register(WatchService watcher, WatchEvent.Kind<?>...
events) throws IOException { + return wrapper.register(watcher, events); + } + + @Override + public Iterator<Path> iterator() { + return wrapper.iterator(); + } + + @Override + public int compareTo(Path other) { + return wrapper.compareTo(other); + } + + @Override + public boolean equals(Object other) { + if (other instanceof CloseSuppressPath) { + return wrapper.equals(((CloseSuppressPath) other).wrapper); + } + return wrapper.equals(other); + } + + @Override + public int hashCode() { + return wrapper.hashCode(); + } + + @Override + public String toString() { + return wrapper.toString(); + } + + @Override + public void forEach(Consumer<? super Path> action) { + wrapper.forEach(action); + } + + @Override + public Spliterator<Path> spliterator() { + return wrapper.spliterator(); + } + + private InputStream openInputStream(boolean buffered) throws IOException { + if (buffered) { + return openInputStream(BUFFER_SIZE); + } else { + return Files.newInputStream(wrapper); + } + } + + public InputStream openInputStream(int bufferSize) throws IOException { + return new BufferedInputStream(openInputStream(false), bufferSize); + } + + private OutputStream openOutputStream(boolean buffered) throws IOException { + if (buffered) { + return openOutputStream(BUFFER_SIZE); + } else { + return Files.newOutputStream(wrapper); + } + } + + public OutputStream openOutputStream(int bufferSize) throws IOException { + return new BufferedOutputStream(openOutputStream(false), bufferSize); + } + + /** + * close this path with a recursive delete instead of a single delete-if-exists + */ + public void closeWithDeleteRecurse() { + isDir = true; + } + + public void mkdirs() throws IOException { + Files.createDirectories(wrapper); + } + + public Path getJavaPath() { + return wrapper; + } + + @Override + public void close() throws IOException { + if (isDir) { + IOUtil.deleteDirRecurse(wrapper); + } else { + Files.deleteIfExists(wrapper); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java index bbd84658..b56b790a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java @@ -28,25 +28,53 @@ import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.util.Reference; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.visnow.jlargearrays.LargeArrayUtils; -import java.io.*; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; import java.net.URL; import java.net.URLConnection; +import java.nio.file.FileVisitResult; +import java.nio.file.FileVisitor; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + import
java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.*; -import java.util.zip.GZIPInputStream; /** * @author mario.arias - * */ public class IOUtil { private static int mappedBuffer; - private IOUtil() {} + + private IOUtil() { + } /** * clean direct allocated buffer @@ -165,25 +193,31 @@ private static void throwIOOrRuntime(Throwable t) throws IOException { } public static InputStream getFileInputStream(String fileName) throws IOException { + return getFileInputStream(fileName, true); + } + + public static InputStream getFileInputStream(String fileName, boolean uncompress) throws IOException { InputStream input; String name = fileName.toLowerCase(); - if(name.startsWith("http:/") || name.startsWith("ftp:/")) { + if (name.startsWith("http:/") || name.startsWith("ftp:/")) { URL url = new URL(fileName); URLConnection con = url.openConnection(); - con.connect(); - input = con.getInputStream(); - } else if(name.equals("-")) { + con.connect(); + input = con.getInputStream(); + } else if (name.equals("-")) { input = new BufferedInputStream(System.in); } else { input = new BufferedInputStream(new FileInputStream(fileName)); } - if(name.endsWith(".gz")||name.endsWith(".tgz")) { - input = new GZIPInputStream(input); - } else if(name.endsWith("bz2") || name.endsWith("bz")) { - input = new BZip2CompressorInputStream(input, true); - } else if(name.endsWith("xz")) { - input = new XZCompressorInputStream(input, true); + if (uncompress) { + if (name.endsWith(".gz") || name.endsWith(".tgz")) { + input = new GZIPInputStream(input); + } else if (name.endsWith("bz2") || name.endsWith("bz")) { + input = new BZip2CompressorInputStream(input, true); + } else if (name.endsWith("xz")) { + input = new XZCompressorInputStream(input, true); + } } return input; } @@ -194,12 +228,12 @@ public static BufferedReader getFileReader(String fileName) throws IOException { public static String readLine(InputStream in, char character) throws IOException { ByteArrayOutputStream buf = new ByteArrayOutputStream(); - while(true) { + while (true) { int value = in.read(); - if(value==-1) { + if (value == -1) { throw new EOFException(); } - if(value==character) { + if (value == character) { break; } buf.write(value); @@ -209,12 +243,12 @@ public static String readLine(InputStream in, char character) throws IOException public static String readChars(InputStream in, int numChars) throws IOException { StringBuilder out = new StringBuilder(); - for(int i=0;in ? n-total : buffer.length); + total += len; + len = (int) (total + buffer.length > n ? 
n - total : buffer.length); } } @@ -277,33 +311,35 @@ public static void decompressGzip(File src, File trgt) throws IOException { } finally { out.close(); } - }finally { + } finally { in.close(); } } /** * Write long, little endian + * * @param output * @param value * @throws IOException */ public static void writeLong(OutputStream output, long value) throws IOException { byte[] writeBuffer = new byte[8]; - writeBuffer[7] = (byte)(value >>> 56); - writeBuffer[6] = (byte)(value >>> 48); - writeBuffer[5] = (byte)(value >>> 40); - writeBuffer[4] = (byte)(value >>> 32); - writeBuffer[3] = (byte)(value >>> 24); - writeBuffer[2] = (byte)(value >>> 16); - writeBuffer[1] = (byte)(value >>> 8); - writeBuffer[0] = (byte)(value); + writeBuffer[7] = (byte) (value >>> 56); + writeBuffer[6] = (byte) (value >>> 48); + writeBuffer[5] = (byte) (value >>> 40); + writeBuffer[4] = (byte) (value >>> 32); + writeBuffer[3] = (byte) (value >>> 24); + writeBuffer[2] = (byte) (value >>> 16); + writeBuffer[1] = (byte) (value >>> 8); + writeBuffer[0] = (byte) (value); output.write(writeBuffer, 0, 8); } /** * Read long, little endian. + * * @param input * @throws IOException */ @@ -311,25 +347,26 @@ public static long readLong(InputStream input) throws IOException { int n = 0; byte[] readBuffer = new byte[8]; while (n < 8) { - int count = input.read(readBuffer, n , 8-n); + int count = input.read(readBuffer, n, 8 - n); if (count < 0) throw new EOFException(); n += count; } - return ((long)readBuffer[7] << 56) + - ((long)(readBuffer[6] & 255) << 48) + - ((long)(readBuffer[5] & 255) << 40) + - ((long)(readBuffer[4] & 255) << 32) + - ((long)(readBuffer[3] & 255) << 24) + - ((readBuffer[2] & 255) << 16) + - ((readBuffer[1] & 255) << 8) + - ((readBuffer[0] & 255) - ); + return ((long) readBuffer[7] << 56) + + ((long) (readBuffer[6] & 255) << 48) + + ((long) (readBuffer[5] & 255) << 40) + + ((long) (readBuffer[4] & 255) << 32) + + ((long) (readBuffer[3] & 255) << 24) + + ((readBuffer[2] & 255) << 16) + + ((readBuffer[1] & 255) << 8) + + ((readBuffer[0] & 255) + ); } /** * Write int, little endian + * * @param output * @param value * @throws IOException @@ -337,10 +374,10 @@ public static long readLong(InputStream input) throws IOException { public static void writeInt(OutputStream output, int value) throws IOException { byte[] writeBuffer = new byte[4]; writeBuffer[0] = (byte) (value & 0xFF); - writeBuffer[1] = (byte) ((value>>8) & 0xFF); - writeBuffer[2] = (byte) ((value>>16) & 0xFF); - writeBuffer[3] = (byte) ((value>>24) & 0xFF); - output.write(writeBuffer,0,4); + writeBuffer[1] = (byte) ((value >> 8) & 0xFF); + writeBuffer[2] = (byte) ((value >> 16) & 0xFF); + writeBuffer[3] = (byte) ((value >> 24) & 0xFF); + output.write(writeBuffer, 0, 4); } /** @@ -349,14 +386,15 @@ public static void writeInt(OutputStream output, int value) throws IOException { public static byte[] intToByteArray(int value) { byte[] writeBuffer = new byte[4]; writeBuffer[0] = (byte) (value & 0xFF); - writeBuffer[1] = (byte) ((value>>8) & 0xFF); - writeBuffer[2] = (byte) ((value>>16) & 0xFF); - writeBuffer[3] = (byte) ((value>>24) & 0xFF); + writeBuffer[1] = (byte) ((value >> 8) & 0xFF); + writeBuffer[2] = (byte) ((value >> 16) & 0xFF); + writeBuffer[3] = (byte) ((value >> 24) & 0xFF); return writeBuffer; } /** * Read int, little endian + * * @param in input * @return integer * @throws IOException @@ -373,29 +411,30 @@ public static int readInt(InputStream in) throws IOException { /** * Convert byte array to int, little endian + * * @param value */ - 
public static int byteArrayToInt(byte[] value){ + public static int byteArrayToInt(byte[] value) { return (value[3] << 24) + (value[2] << 16) + (value[1] << 8) + (value[0] << 0); } /** - * @param input din - * @param length bytes + * @param input din + * @param length bytes * @param listener */ public static byte[] readBuffer(InputStream input, int length, ProgressListener listener) throws IOException { int nRead; - int pos=0; + int pos = 0; byte[] data = new byte[length]; - while ((nRead = input.read(data, pos, length-pos)) >0) { + while ((nRead = input.read(data, pos, length - pos)) > 0) { // TODO: Notify progress listener pos += nRead; } - if(pos!=length) { - throw new IOException("EOF while reading array from InputStream"); + if (pos != length) { + throw new EOFException("EOF while reading array from InputStream"); } return data; @@ -404,8 +443,8 @@ public static byte[] readBuffer(InputStream input, int length, ProgressListener public static CharSequence toBinaryString(long val) { StringBuilder str = new StringBuilder(64); int bits = 64; - while(bits-- != 0) { - str.append(((val>>>bits) & 1) !=0 ? '1' : '0'); + while (bits-- != 0) { + str.append(((val >>> bits) & 1) != 0 ? '1' : '0'); } return str; } @@ -413,8 +452,8 @@ public static CharSequence toBinaryString(long val) { public static CharSequence toBinaryString(int val) { StringBuilder str = new StringBuilder(32); int bits = 32; - while(bits-- != 0) { - str.append(((val>>>bits) & 1) !=0 ? '1' : '0'); + while (bits-- != 0) { + str.append(((val >>> bits) & 1) != 0 ? '1' : '0'); } return str; } @@ -425,8 +464,8 @@ public static void printBitsln(long val, int bits) { } public static void printBits(long val, int bits) { - while(bits-- != 0) { - System.out.print( ((val>>>bits) & 1) !=0 ? '1' : '0'); + while (bits-- != 0) { + System.out.print(((val >>> bits) & 1) != 0 ? '1' : '0'); } } @@ -438,7 +477,7 @@ public static short readShort(InputStream in) throws IOException { throw new EOFException(); } - return (short)((ch2 << 8) + (ch1)); + return (short) ((ch2 << 8) + (ch1)); } public static void writeShort(OutputStream out, short value) throws IOException { @@ -451,7 +490,7 @@ public static byte readByte(InputStream in) throws IOException { if (b < 0) { throw new EOFException(); } - return (byte)(b&0xFF); + return (byte) (b & 0xFF); } public static void writeByte(OutputStream out, byte value) throws IOException { @@ -461,18 +500,18 @@ public static void writeByte(OutputStream out, byte value) throws IOException { // InputStream might not skip the specified number of bytes. This call makes multiple calls // if needed to ensure that the desired number of bytes is actually skipped. 
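// Usage sketch (illustrative): a raw InputStream.skip(n) may skip fewer than n
// bytes (buffered, piped or network streams), so a single call such as
//
//     in.skip(16); // may skip fewer than 16 bytes
//
// is not reliable, while IOUtil.skip(in, 16) loops until the full count has been
// skipped.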
public static void skip(InputStream in, long n) throws IOException { - if(n==0) { + if (n == 0) { return; } long totalSkipped = in.skip(n); - while(totalSkipped<n) { - totalSkipped += in.skip(n-totalSkipped); + while (totalSkipped < n) { + totalSkipped += in.skip(n - totalSkipped); } } + + public static void deleteDirRecurse(Path path) throws IOException { + Files.walkFileTree(path, new FileVisitor<Path>() { + @Override + public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFileFailed(Path file, IOException exc) { + return FileVisitResult.TERMINATE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + Files.delete(dir); + return FileVisitResult.CONTINUE; + } + }); + } + } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeMergeIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeMergeIterator.java new file mode 100644 index 00000000..816ed280 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeMergeIterator.java @@ -0,0 +1,26 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; + +import java.io.IOException; +import java.util.Comparator; +import java.util.List; +import java.util.function.Function; + +/** + * Version of {@link org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator} with {@link org.rdfhdt.hdt.triples.IndexedNode} + * @author Antoine Willerval + */ +public class CompressNodeMergeIterator extends MergeExceptionIterator<IndexedNode, IOException> { + + public CompressNodeMergeIterator(ExceptionIterator<IndexedNode, IOException> in1, ExceptionIterator<IndexedNode, IOException> in2) { + super(in1, in2, IndexedNode::compareTo); + } + + public static <T extends ExceptionIterator<IndexedNode, IOException>> ExceptionIterator<IndexedNode, IOException> buildOfTree( + T[] lst) { + return buildOfTree(it -> it, IndexedNode::compareTo, lst, 0, lst.length); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java new file mode 100644 index 00000000..0b5f0916 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java @@ -0,0 +1,90 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.exceptions.CRCException; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRC8; +import org.rdfhdt.hdt.util.crc.CRCInputStream; +import org.rdfhdt.hdt.util.string.ReplazableString; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; + +/** + * Class to read a compressed node file + * + * @author Antoine Willerval + */ +public class CompressNodeReader implements ExceptionIterator<IndexedNode, IOException>, Closeable { + private final CRCInputStream stream; + private final long size; + private long index; + private boolean waiting; + private final IndexedNode last; + private final ReplazableString tempString; + + public CompressNodeReader(InputStream stream) throws IOException { + this.stream = new CRCInputStream(stream, new CRC8()); + this.size = VByte.decode(this.stream); + if(!this.stream.readCRCAndCheck()) { + throw new CRCException("CRC Error while merging Section Plain Front Coding Header."); + } + this.stream.setCRC(new CRC32()); + this.tempString =
new ReplazableString(); + this.last = new IndexedNode(tempString, -1); + } + + public long getSize() { + return size; + } + + public void checkComplete() throws IOException { + if(!this.stream.readCRCAndCheck()) { + throw new CRCException("CRC Error while merging Section Plain Front Coding Header."); + } + } + + /** + * @return the next element without passing to the next element + * @throws IOException reading exception + */ + public IndexedNode read() throws IOException { + if (waiting) { + return last; + } + int delta = (int) VByte.decode(stream); + tempString.replace2(stream, delta); + long index = VByte.decode(stream); + last.setIndex(index); + waiting = true; + return last; + } + + /** + * pass to the next element, mandatory with {@link #read()} + */ + public void pass() { + waiting = false; + index++; + } + + @Override + public IndexedNode next() throws IOException { + IndexedNode node = read(); + pass(); + return node; + } + @Override + public boolean hasNext() throws IOException { + return index < size; + } + + @Override + public void close() throws IOException { + stream.close(); + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java new file mode 100644 index 00000000..ebdcc534 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java @@ -0,0 +1,62 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRC8; +import org.rdfhdt.hdt.util.crc.CRCOutputStream; +import org.rdfhdt.hdt.util.string.ByteStringUtil; +import org.rdfhdt.hdt.util.string.CompactString; +import org.rdfhdt.hdt.util.string.ReplazableString; + +import java.io.Closeable; +import java.io.IOException; +import java.io.OutputStream; + +/** + * Class to write a compress node file + * + * @author Antoine Willerval + */ +public class CompressNodeWriter implements Closeable { + private final CRCOutputStream out; + private final ReplazableString previousStr = new ReplazableString(); + + public CompressNodeWriter(OutputStream stream, long size) throws IOException { + this.out = new CRCOutputStream(stream, new CRC8()); + VByte.encode(this.out, size); + this.out.writeCRC(); + this.out.setCRC(new CRC32()); + } + + public void appendNode(IndexedNode node) throws IOException { + CharSequence str = node.getNode(); + long index = node.getIndex(); + + // to avoid bad longestCommonPrefix call + // cf: https://github.com/rdfhdt/hdt-java/issues/165 + if (str instanceof String) { + str = new CompactString(str); + } + + // Find common part. 
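// Worked example of the front coding below (illustrative): with
//   previousStr = "http://example.org/alice"
//   str         = "http://example.org/bob"
// the common prefix "http://example.org/" gives delta = 19, so only the
// VByte-encoded delta, the suffix "bob", a 0 terminator and the VByte-encoded
// node index are written for this entry.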
+ int delta = ByteStringUtil.longestCommonPrefix(previousStr, str); + // Write Delta in VByte + VByte.encode(out, delta); + // Write remaining + ByteStringUtil.append(out, str, delta); + out.write(0); // End of string + VByte.encode(out, index); // index of the node + previousStr.replace(str); + } + + public void writeCRC() throws IOException { + out.writeCRC(); + } + + @Override + public void close() throws IOException { + writeCRC(); + out.close(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleMergeIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleMergeIterator.java new file mode 100644 index 00000000..f9dcfa13 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleMergeIterator.java @@ -0,0 +1,27 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.TripleIDComparator; + +import java.io.IOException; +import java.util.List; + +/** + * Version of {@link org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator} with {@link org.rdfhdt.hdt.triples.TripleID} + * @author Antoine Willerval + */ +public class CompressTripleMergeIterator extends MergeExceptionIterator<TripleID, IOException> { + + public CompressTripleMergeIterator(ExceptionIterator<TripleID, IOException> in1, ExceptionIterator<TripleID, IOException> in2, TripleComponentOrder order) { + super(in1, in2, TripleIDComparator.getComparator(order)); + } + + public static <T extends ExceptionIterator<TripleID, IOException>> ExceptionIterator<TripleID, IOException> buildOfTree( + T[] lst, TripleComponentOrder order) { + return buildOfTree(it -> it, TripleIDComparator.getComparator(order), lst, 0, lst.length); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleReader.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleReader.java new file mode 100644 index 00000000..2de3cbb9 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleReader.java @@ -0,0 +1,87 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.exceptions.CRCException; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRCInputStream; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; + +/** + * Class to read a pre-mapped triples file + * + * @author Antoine Willerval + */ +public class CompressTripleReader implements ExceptionIterator<TripleID, IOException>, Closeable { + private final CRCInputStream stream; + private final TripleID next = new TripleID(-1, -1, -1); + private boolean read = false, end = false; + + public CompressTripleReader(InputStream stream) { + this.stream = new CRCInputStream(stream, new CRC32()); + } + + @Override + public boolean hasNext() throws IOException { + if (read) { + return true; + } + + // the reader is empty, null end triple + if (end) { + return false; + } + + long s, p, o; + + do { + s = VByte.decode(stream); + p = VByte.decode(stream); + o = VByte.decode(stream); + // continue to read to avoid duplicated triples + } while (s == next.getSubject() && p == next.getPredicate() && o == next.getObject()); + + return !setAllOrEnd(s, p, o); + } + + private boolean setAllOrEnd(long s, long
p, long o) throws IOException { + if (end) { + // already completed + return true; + } + if (s == 0 || p == 0 || o == 0) { + // check triples validity + if (s != 0 || p != 0 || o != 0) { + throw new IOException("Triple got null node, but not all the nodes are 0! " + s + " " + p + " " + o); + } + if (!stream.readCRCAndCheck()) { + throw new CRCException("CRC Error while reading PreMapped triples."); + } + // set to true to avoid reading again the CRC + end = true; + return true; + } + // map the triples to the end id, compute the shared with the end shared size + next.setAll(s, p, o); + read = true; + return false; + } + + @Override + public TripleID next() throws IOException { + if (!hasNext()) { + return null; + } + read = false; + return next; + } + + @Override + public void close() throws IOException { + stream.close(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleWriter.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleWriter.java new file mode 100644 index 00000000..837e1206 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleWriter.java @@ -0,0 +1,61 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.triples.IndexedTriple; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRCOutputStream; + +import java.io.Closeable; +import java.io.IOException; +import java.io.OutputStream; + +/** + * Class to write a pre-mapped triples file + * + * @author Antoine Willerval + */ +public class CompressTripleWriter implements Closeable { + private final CRCOutputStream out; + + public CompressTripleWriter(OutputStream writer) { + this.out = new CRCOutputStream(writer, new CRC32()); + } + /** + * write an indexed triple into an output + * @param triple the triple to write + * @throws java.io.IOException write exception + */ + public void appendTriple(IndexedTriple triple) throws IOException { + VByte.encode(out, triple.getSubject().getIndex()); + VByte.encode(out, triple.getPredicate().getIndex()); + VByte.encode(out, triple.getObject().getIndex()); + } + /** + * write an indexed triple into an output + * @param triple the triple to write + * @throws java.io.IOException write exception + */ + public void appendTriple(TripleID triple) throws IOException { + VByte.encode(out, triple.getSubject()); + VByte.encode(out, triple.getPredicate()); + VByte.encode(out, triple.getObject()); + } + + /** + * Write an end triple and a CRC to complete the writer + * @throws IOException write error + */ + public void writeCRC() throws IOException { + VByte.encode(out, 0); + VByte.encode(out, 0); + VByte.encode(out, 0); + out.writeCRC(); + } + + @Override + public void close() throws IOException { + writeCRC(); + out.close(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java new file mode 100644 index 00000000..4849a0a9 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java @@ -0,0 +1,202 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.ReplazableString; + +import
java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; + +/** + * Utility class to manipulate compressed node + * + * @author Antoine Willerval + */ +public class CompressUtil { + /** + * the mask for shared computed compressed node + */ + public static final long SHARED_MASK = 1L; + /** + * shift after the SHARED/DUPLICATES + */ + public static final int INDEX_SHIFT = 1; + + /** + * write a sorted list of indexed nodes + * + * @param strings the nodes to write + * @param output the output + * @param listener the listener to see the progress + * @throws IOException writing exception + */ + public static void writeCompressedSection(List<IndexedNode> strings, OutputStream output, ProgressListener listener) throws IOException { + writeCompressedSection(ExceptionIterator.of(strings.iterator()), strings.size(), output, listener); + } + + /** + * write a sorted iterator of indexed nodes + * + * @param it iterator to write + * @param size size of the iterator + * @param output the output where to write + * @param listener the listener to see the progress + * @throws IOException writing exception + */ + public static void writeCompressedSection(ExceptionIterator<IndexedNode, IOException> it, long size, OutputStream output, ProgressListener listener) throws IOException { + CompressNodeWriter writer = new CompressNodeWriter(output, size); + long element = 0; + long block = size < 10 ? 1 : size / 10; + while (it.hasNext()) { + if (listener != null && element % block == 0) { + listener.notifyProgress((float) (10 * element / block), "write section " + element + "/" + size); + } + writer.appendNode(it.next()); + element++; + } + it.forEachRemaining(writer::appendNode); + writer.writeCRC(); + if (listener != null) { + listener.notifyProgress(100, "section completed " + size + " nodes"); + } + } + + /** + * merge two streams together into an output stream + * + * @param stream1 input stream 1 + * @param stream2 input stream 2 + * @param output output stream + * @param listener the listener to see the progress + * @throws IOException read/writing exception + */ + public static void mergeCompressedSection(InputStream stream1, InputStream stream2, OutputStream output, ProgressListener listener) throws IOException { + CompressNodeReader in1r = new CompressNodeReader(stream1); + CompressNodeReader in2r = new CompressNodeReader(stream2); + + long size1 = in1r.getSize(); + long size2 = in2r.getSize(); + + // merge the section + writeCompressedSection(new CompressNodeMergeIterator(in1r, in2r), size1 + size2, output, listener); + // check we have completed the 2 readers + in1r.checkComplete(); + in2r.checkComplete(); + } + + /** + * compute the shared-computed id from a shared-computable id + * + * @param id the shared-computable id + * @param sharedCount the count of shared elements + * @return the shared-computed element + */ + public static long computeSharedNode(long id, long sharedCount) { + if ((id & SHARED_MASK) != 0) { + // shared element + return CompressUtil.getId(id); + } + // not shared + return CompressUtil.getId(id) + sharedCount; + } + + /** + * convert this id to a shared-computable element + * + * @param id the id + * @return shared-computable element + */ + public static long asShared(long id) { + return getHeaderId(id) | SHARED_MASK; + } + + /** + * get the id from a header id + * @param headerId the header id + * @return the id + */ + public static long getId(long headerId) { + return headerId >>> INDEX_SHIFT; + } + + /** + * get a
header id from an id + * @param id the id + * @return the header id + */ + public static long getHeaderId(long id) { + return id << INDEX_SHIFT; + } + + /** + * @return a CharSequence-based iterator view of this iterator, with consecutive duplicates removed + */ + public static DuplicatedIterator asNoDupeCharSequenceIterator(ExceptionIterator<IndexedNode, IOException> nodes, DuplicatedNodeConsumer duplicatedNodeConsumer) { + return new DuplicatedIterator(nodes.asIterator(), duplicatedNodeConsumer); + } + + @FunctionalInterface + public interface DuplicatedNodeConsumer { + void onDuplicated(long originalIndex, long duplicatedIndex, long originalHeader); + } + + public static class DuplicatedIterator implements Iterator<IndexedNode> { + private final Iterator<IndexedNode> it; + private final ReplazableString prev = new ReplazableString(); + private IndexedNode next; + private long id; + private final DuplicatedNodeConsumer duplicatedNodeConsumer; + private long lastHeader; + + DuplicatedIterator(Iterator<IndexedNode> it, DuplicatedNodeConsumer duplicatedNodeConsumer) { + this.it = it; + this.duplicatedNodeConsumer = Objects.requireNonNullElseGet(duplicatedNodeConsumer, () -> (i, j, k) -> { + }); + } + + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + while (it.hasNext()) { + IndexedNode node = it.next(); + CharSequence next = node.getNode(); + if (CharSequenceComparator.getInstance().compare(prev, next) == 0) { + // same as previous, ignore + assert this.id != node.getIndex() : "same index and prevIndex"; + duplicatedNodeConsumer.onDuplicated(this.id, node.getIndex(), lastHeader); + continue; + } + this.next = node; + prev.replace(next); + this.id = node.getIndex(); + return true; + } + return false; + } + + @Override + public IndexedNode next() { + if (!hasNext()) { + return null; + } + IndexedNode old = next; + next = null; + return old; + } + + public void setLastHeader(long lastHeader) { + this.lastHeader = lastHeader; + } + } + + private CompressUtil() { + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/MapCompressTripleMerger.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/MapCompressTripleMerger.java new file mode 100644 index 00000000..b055e54e --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/MapCompressTripleMerger.java @@ -0,0 +1,244 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.hdt.impl.diskimport.*; +import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.SizeFetcher; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.TripleIDComparator; +import org.rdfhdt.hdt.util.ParallelSortableArrayList; +import org.rdfhdt.hdt.util.concurrent.KWayMerger; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Supplier; + +/** + * KWayMergerImpl implementation to map and merge TripleIDs from a compressed triples file + * + * @author Antoine Willerval + */ +public class MapCompressTripleMerger implements KWayMerger.KWayMergerImpl<TripleID, SizeFetcher<TripleID>> { + private static final Logger log =
LoggerFactory.getLogger(MapCompressTripleMerger.class); + private final CloseSuppressPath baseFileName; + private final AsyncIteratorFetcher<TripleID> source; + private final CompressTripleMapper mapper; + private final MultiThreadListener listener; + private final TripleComponentOrder order; + private final int bufferSize; + private final int k; + private final AtomicLong triplesCount = new AtomicLong(); + private final long chunkSize; + + public MapCompressTripleMerger(CloseSuppressPath baseFileName, AsyncIteratorFetcher<TripleID> source, CompressTripleMapper mapper, MultiThreadListener listener, TripleComponentOrder order, int bufferSize, long chunkSize, int k) { + this.baseFileName = baseFileName; + this.source = source; + this.mapper = mapper; + this.listener = listener; + this.order = order; + this.bufferSize = bufferSize; + this.chunkSize = chunkSize; + this.k = k; + } + + /** + * merge these triples into a file + * + * @param workers number of workers + * @return result + * @throws KWayMerger.KWayMergerException TreeWorker error + * @throws InterruptedException thread interruption + * @throws IOException io error + */ + public TripleCompressionResult mergeToFile(int workers) throws InterruptedException, IOException, KWayMerger.KWayMergerException { + // force to create the first file + KWayMerger<TripleID, SizeFetcher<TripleID>> merger = new KWayMerger<>(baseFileName, source, this, Math.max(1, workers - 1), k); + merger.start(); + // wait for the workers to merge the sections and create the triples + Optional<CloseSuppressPath> sections = merger.waitResult(); + if (sections.isEmpty()) { + return new TripleCompressionResultEmpty(order); + } + return new TripleCompressionResultFile(triplesCount.get(), sections.get(), order, bufferSize); + } + + /** + * merge these triples while reading them, increases the memory usage + * + * @return result + * @throws IOException io error + */ + public TripleCompressionResult mergeToPartial() throws IOException, KWayMerger.KWayMergerException { + List<CloseSuppressPath> files = new ArrayList<>(); + try { + baseFileName.mkdirs(); + baseFileName.closeWithDeleteRecurse(); + long fileName = 0; + while (!source.isEnd()) { + CloseSuppressPath file = baseFileName.resolve("chunk#" + fileName++); + createChunk(newStopFlux(source), file); + files.add(file); + } + } catch (Throwable e) { + try { + throw e; + } finally { + try { + IOUtil.closeAll(files); + } finally { + baseFileName.close(); + } + } + } + return new TripleCompressionResultPartial(files, triplesCount.get(), order, bufferSize) { + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + baseFileName.close(); + } + } + }; + } + + /** + * merge the triples into a result + * + * @param workers number of workers (complete mode) + * @param mode the mode of merging + * @return result + * @throws KWayMerger.KWayMergerException TreeWorker error (complete mode) + * @throws InterruptedException thread interruption (complete mode) + * @throws IOException io error + */ + public TripleCompressionResult merge(int workers, String mode) throws KWayMerger.KWayMergerException, InterruptedException, IOException { + if (mode == null) { + mode = ""; + } + switch (mode) { + case "": + case CompressionResult.COMPRESSION_MODE_COMPLETE: + return mergeToFile(workers); + case CompressionResult.COMPRESSION_MODE_PARTIAL: + return mergeToPartial(); + default: + throw new IllegalArgumentException("Unknown compression mode: " + mode); + } + } + + @Override + public void createChunk(SizeFetcher<TripleID> flux, CloseSuppressPath output) throws KWayMerger.KWayMergerException { + BufferedTriples
+	@Override
+	public void createChunk(SizeFetcher<TripleID> flux, CloseSuppressPath output) throws KWayMerger.KWayMergerException {
+		BufferedTriples buffer = new BufferedTriples();
+		ParallelSortableArrayList<TripleID> tripleIDS = buffer.triples;
+		listener.notifyProgress(10, "reading triples part2 " + triplesCount);
+		TripleID next;
+		while ((next = flux.get()) != null) {
+			TripleID mappedTriple = new TripleID(
+					mapper.extractSubject(next.getSubject()),
+					mapper.extractPredicate(next.getPredicate()),
+					mapper.extractObjects(next.getObject())
+			);
+			assert mappedTriple.isValid();
+			tripleIDS.add(mappedTriple);
+			long count = triplesCount.incrementAndGet();
+			if (count % 100_000 == 0) {
+				listener.notifyProgress(10, "reading triples part2 " + triplesCount);
+			}
+			if (tripleIDS.size() == Integer.MAX_VALUE - 6) {
+				break;
+			}
+		}
+		try {
+			tripleIDS.parallelSort(TripleIDComparator.getComparator(order));
+			int count = 0;
+			int block = tripleIDS.size() < 10 ? 1 : tripleIDS.size() / 10;
+			IntermediateListener il = new IntermediateListener(listener);
+			il.setRange(70, 100);
+			il.setPrefix("writing triples " + output.getFileName() + " ");
+			try (CompressTripleWriter w = new CompressTripleWriter(output.openOutputStream(bufferSize))) {
+				il.notifyProgress(0, "creating file");
+				TripleID prev = new TripleID(-1, -1, -1);
+				for (TripleID triple : tripleIDS) {
+					count++;
+					if (count % block == 0) {
+						il.notifyProgress(count / (block / 10f), "writing triples " + count + "/" + tripleIDS.size());
+					}
+					if (prev.match(triple)) {
+						continue;
+					}
+					prev.setAll(triple.getSubject(), triple.getPredicate(), triple.getObject());
+					w.appendTriple(triple);
+				}
+				listener.notifyProgress(100, "writing completed " + triplesCount + " " + output.getFileName());
+			}
+		} catch (IOException e) {
+			throw new KWayMerger.KWayMergerException(e);
+		}
+	}
+
+	@Override
+	public void mergeChunks(List<CloseSuppressPath> inputs, CloseSuppressPath output) throws KWayMerger.KWayMergerException {
+		try {
+			listener.notifyProgress(0, "merging triples " + output.getFileName());
+			CompressTripleReader[] readers = new CompressTripleReader[inputs.size()];
+			try {
+				for (int i = 0; i < inputs.size(); i++) {
+					readers[i] = new CompressTripleReader(inputs.get(i).openInputStream(bufferSize));
+				}
+
+				try (CompressTripleWriter w = new CompressTripleWriter(output.openOutputStream(bufferSize))) {
+					ExceptionIterator<TripleID, IOException> it = CompressTripleMergeIterator.buildOfTree(readers, order);
+					while (it.hasNext()) {
+						w.appendTriple(it.next());
+					}
+				}
+			} finally {
+				IOUtil.closeAll(readers);
+			}
+			listener.notifyProgress(100, "triples merged " + output.getFileName());
+			// delete old triples
+			IOUtil.closeAll(inputs);
+		} catch (IOException e) {
+			throw new KWayMerger.KWayMergerException(e);
+		}
+	}
+
+	@Override
+	public SizeFetcher<TripleID> newStopFlux(Supplier<TripleID> flux) {
+		return SizeFetcher.ofTripleLong(flux, chunkSize);
+	}
+
+	public static class TripleFile implements Closeable {
+		long triples;
+		CloseSuppressPath path;
+
+		private TripleFile(long triples, CloseSuppressPath path) {
+			this.triples = triples;
+			this.path = path;
+		}
+
+		@Override
+		public void close() throws IOException {
+			path.close();
+		}
+	}
+
+	public static class BufferedTriples {
+		ParallelSortableArrayList<TripleID> triples = new ParallelSortableArrayList<>(TripleID[].class);
+
+		private BufferedTriples() {
+		}
+	}
+}
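Aside: createChunk writes each chunk already sorted and de-duplicated, so mergeChunks reduces to a classic k-way merge of sorted runs. A self-contained sketch of that idea with toy types follows; the patch's KWayMerger/CompressTripleMergeIterator instead build a tree of pairwise merge iterators, but the effect is the same.

    import java.util.*;

    class KWayMergeSketch {
        /** Merge k sorted lists into one sorted list with a priority queue. */
        static List<Long> merge(List<List<Long>> sortedRuns) {
            record Head(long value, int run, int pos) {}
            PriorityQueue<Head> pq = new PriorityQueue<>(Comparator.comparingLong(Head::value));
            for (int r = 0; r < sortedRuns.size(); r++)
                if (!sortedRuns.get(r).isEmpty())
                    pq.add(new Head(sortedRuns.get(r).get(0), r, 0));
            List<Long> out = new ArrayList<>();
            while (!pq.isEmpty()) {
                Head h = pq.poll();            // smallest head among all runs
                out.add(h.value());
                int next = h.pos() + 1;
                if (next < sortedRuns.get(h.run()).size())
                    pq.add(new Head(sortedRuns.get(h.run()).get(next), h.run(), next));
            }
            return out;
        }
    }
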
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/NoDuplicateTripleIDIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/NoDuplicateTripleIDIterator.java
new file mode 100644
index 00000000..bc4d003b
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/NoDuplicateTripleIDIterator.java
@@ -0,0 +1,95 @@
+package org.rdfhdt.hdt.util.io.compress;
+
+import org.rdfhdt.hdt.enums.ResultEstimationType;
+import org.rdfhdt.hdt.enums.TripleComponentOrder;
+import org.rdfhdt.hdt.exceptions.NotImplementedException;
+import org.rdfhdt.hdt.triples.IteratorTripleID;
+import org.rdfhdt.hdt.triples.TripleID;
+
+/**
+ * an IteratorTripleID implementation removing duplicated triples from a sorted iterator
+ * @author Antoine Willerval
+ */
+public class NoDuplicateTripleIDIterator implements IteratorTripleID {
+	private TripleID next;
+	private final TripleID prev = new TripleID(-1, -1, -1);
+	private final IteratorTripleID it;
+
+	public NoDuplicateTripleIDIterator(IteratorTripleID it) {
+		this.it = it;
+	}
+
+	@Override
+	public boolean hasNext() {
+		while (this.next == null) {
+			if (!it.hasNext()) {
+				return false;
+			}
+
+			TripleID next = it.next();
+
+			if (next.match(prev)) {
+				continue;
+			}
+			prev.setAll(next.getSubject(), next.getPredicate(), next.getObject());
+
+			this.next = next;
+		}
+		return true;
+	}
+
+	@Override
+	public TripleID next() {
+		if (!hasNext()) {
+			return null;
+		}
+		TripleID next = this.next;
+		this.next = null;
+		return next;
+	}
+
+	@Override
+	public boolean hasPrevious() {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public TripleID previous() {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public void goToStart() {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public boolean canGoTo() {
+		return false;
+	}
+
+	@Override
+	public void goTo(long pos) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public long estimatedNumResults() {
+		return it.estimatedNumResults();
+	}
+
+	@Override
+	public ResultEstimationType numResultEstimation() {
+		return it.numResultEstimation();
+	}
+
+	@Override
+	public TripleComponentOrder getOrder() {
+		return it.getOrder();
+	}
+
+	@Override
+	public long getLastTriplePosition() {
+		throw new NotImplementedException();
+	}
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/TripleGenerator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/TripleGenerator.java
new file mode 100644
index 00000000..bea9096d
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/TripleGenerator.java
@@ -0,0 +1,28 @@
+package org.rdfhdt.hdt.util.io.compress;
+
+import org.rdfhdt.hdt.triples.TripleID;
+
+import java.util.Iterator;
+
+/**
+ * Utility class to generate triples
+ */
+public class TripleGenerator implements Iterator<TripleID> {
+	private final long triples;
+	private long current = 1;
+
+	public TripleGenerator(long triples) {
+		this.triples = triples;
+	}
+
+	@Override
+	public boolean hasNext() {
+		return current <= triples;
+	}
+
+	@Override
+	public TripleID next() {
+		long c = current++;
+		return new TripleID(c, c, c);
+	}
+}
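Aside: these two new classes are small building blocks. TripleGenerator yields the strictly increasing triples (1,1,1), (2,2,2), ..., and NoDuplicateTripleIDIterator drops duplicates from a sorted stream with the prev.match(...)/prev.setAll(...) pattern. A minimal sketch of that same pattern over a plain iterator, with toy input:

    import org.rdfhdt.hdt.triples.TripleID;
    import java.util.Iterator;
    import java.util.List;

    class DedupSketch {
        public static void main(String[] args) {
            // sorted input with one duplicate
            Iterator<TripleID> it = List.of(
                    new TripleID(1, 1, 1), new TripleID(1, 1, 1), new TripleID(1, 2, 1)).iterator();
            TripleID prev = new TripleID(-1, -1, -1);
            while (it.hasNext()) {
                TripleID next = it.next();
                if (next.match(prev)) continue; // same as the previous triple: skip
                prev.setAll(next.getSubject(), next.getPredicate(), next.getObject());
                System.out.println(next); // prints (1,1,1) and (1,2,1)
            }
        }
    }
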
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/WriteLongArrayBuffer.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/WriteLongArrayBuffer.java
new file mode 100644
index 00000000..4da21cc3
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/WriteLongArrayBuffer.java
@@ -0,0 +1,224 @@
+package org.rdfhdt.hdt.util.io.compress;
+
+import org.rdfhdt.hdt.util.BitUtil;
+import org.rdfhdt.hdt.util.disk.LongArray;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * A class to buffer writes to a long array, chunking the set() calls and sorting them by index before applying them
+ * @author Antoine Willerval
+ */
+public class WriteLongArrayBuffer implements LongArray, Closeable {
+	// debug field
+	private static final boolean DISABLE_BUFFER = true;
+	private final LongArray array;
+	private ArrayElementLong[] bufferLong;
+	private ArrayElementInt[] bufferInt;
+	private int index = 0;
+	private boolean lastOrder;
+
+	/**
+	 * create the buffer
+	 * @param array the array to write
+	 * @param maxValue the maximum value, to use int64 or int32
+	 * @param maxElement count of long elements to store
+	 */
+	public WriteLongArrayBuffer(LongArray array, long maxValue, int maxElement) {
+		this.array = array;
+		if (!DISABLE_BUFFER) {
+			int bits = BitUtil.log2(maxValue + 2) + CompressUtil.INDEX_SHIFT; // + 1 for shared
+
+			if (bits > 31) {
+				bufferLong = new ArrayElementLong[maxElement / 3];
+			} else {
+				// we can store twice as many elements, so we add * 2L
+				bufferInt = new ArrayElementInt[(int) (maxElement * 2L / 3)];
+			}
+		}
+	}
+
+	/**
+	 * clear all the elements
+	 */
+	public void clear() {
+		index = 0;
+	}
+
+	public void free() {
+		flush();
+		bufferInt = null;
+		bufferLong = null;
+		System.gc();
+	}
+
+	private ArrayElement get(int index) {
+		if (bufferLong != null) {
+			return bufferLong[index];
+		} else if (bufferInt != null) {
+			return bufferInt[index];
+		} else {
+			throw new IllegalArgumentException("free buffer!");
+		}
+	}
+
+	private void checkConsistency() {
+		if (size() == maxCapacity()) {
+			flush();
+		}
+	}
+
+	/**
+	 * write all the pending sets and clear the buffer
+	 */
+	public void flush() {
+		// ignore empty array
+		if (size() == 0) {
+			return;
+		}
+
+		// sort the set calls
+		if (bufferLong != null) {
+			Arrays.sort(bufferLong, 0, size(), ArrayElement::compareTo);
+		} else if (bufferInt != null) {
+			Arrays.sort(bufferInt, 0, size(), ArrayElement::compareTo);
+		} else {
+			return;
+		}
+
+		// reverse the order to write from the end to the start
+		if (lastOrder) {
+			for (int i = 0; i < index; i++) {
+				ArrayElement e = get(i);
+				array.set(e.getIndex(), e.getValue());
+			}
+		} else {
+			for (int i = index - 1; i >= 0; i--) {
+				ArrayElement e = get(i);
+				array.set(e.getIndex(), e.getValue());
+			}
+		}
+		// reverse for next run
+		lastOrder = !lastOrder;
+		// clear the buffer
+		clear();
+	}
+
+	/**
+	 * get a value from the array; flushes all pending sets first
+	 * @param index the index
+	 * @return the value
+	 */
+	@Override
+	public long get(long index) {
+		flush();
+		return array.get(index);
+	}
+
+	/**
+	 * set a value in the array
+	 * @param index the index
+	 * @param value the value to set
+	 */
+	@Override
+	public void set(long index, long value) {
+		if (DISABLE_BUFFER) {
+			array.set(index, value);
+			return;
+		}
+		if (bufferLong != null) {
+			bufferLong[this.index++] = new ArrayElementLong(index, value);
+		} else {
+			bufferInt[this.index++] = new ArrayElementInt(index, value);
+		}
+		// check for flush
+		checkConsistency();
+	}
+
+	/**
+	 * get the length of the array; flushes all pending sets first
+	 * @return the length of the array
+	 */
+	@Override
+	public long length() {
+		flush();
+		return array.length();
+	}
+
+	/**
+	 * @return the used size of the buffer
+	 */
+	public int size() {
+		return index;
+	}
+
+	/**
+	 * @return the max capacity of the buffer
+	 */
+	public int maxCapacity() {
+		if (bufferLong != null) {
+			return bufferLong.length;
+		} else {
+			return bufferInt.length;
+		}
+	}
+
+	@Override
+	public void close() throws IOException {
+		flush();
+		if (array instanceof Closeable) {
+			((Closeable) array).close();
+		}
+	}
+	private interface ArrayElement extends Comparable<ArrayElement> {
+		long getIndex();
+
+		long getValue();
+
+		@Override
+		default int compareTo(ArrayElement o) {
+			return Long.compare(getIndex(), o.getIndex());
+		}
+	}
+
+	private static class ArrayElementLong implements ArrayElement {
+		private final long index, value;
+
+		public ArrayElementLong(long index, long value) {
+			this.index = index;
+			this.value = value;
+		}
+
+		@Override
+		public long getIndex() {
+			return index;
+		}
+
+		@Override
+		public long getValue() {
+			return value;
+		}
+	}
+
+	private static class ArrayElementInt implements ArrayElement {
+		private final int index, value;
+
+		public ArrayElementInt(long index, long value) {
+			this.index = (int) index;
+			this.value = (int) value;
+		}
+
+		@Override
+		public long getIndex() {
+			return index;
+		}
+
+		@Override
+		public long getValue() {
+			return value;
+		}
+	}
+}
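Aside: WriteLongArrayBuffer batches set() calls and applies them sorted by index, which matters when the backing LongArray is disk-backed and random writes are expensive; note that with DISABLE_BUFFER set to true above, writes currently pass straight through. A usage sketch with a toy in-memory LongArray, assuming the three methods used in this patch are the whole interface:

    import org.rdfhdt.hdt.util.disk.LongArray;
    import org.rdfhdt.hdt.util.io.compress.WriteLongArrayBuffer;
    import java.io.IOException;

    class BufferSketch {
        public static void main(String[] args) throws IOException {
            long[] backing = new long[16];
            // toy in-memory LongArray; real implementations may live on disk
            LongArray array = new LongArray() {
                @Override public long get(long index) { return backing[(int) index]; }
                @Override public void set(long index, long value) { backing[(int) index] = value; }
                @Override public long length() { return backing.length; }
            };
            try (WriteLongArrayBuffer buffer = new WriteLongArrayBuffer(array, 15, 16)) {
                buffer.set(9, 3); // out-of-order writes are buffered (when enabled)...
                buffer.set(2, 7);
                System.out.println(buffer.get(2)); // ...and flushed, sorted by index, before any read
            }
        }
    }
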
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/IntermediateListener.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/IntermediateListener.java
index 8ff76bf0..ff47e6a6 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/IntermediateListener.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/IntermediateListener.java
@@ -45,16 +45,48 @@ public class IntermediateListener implements ProgressListener {
 
 	private final ProgressListener child;
 	private float min, max;
-	
+	private String prefix;
+
 	/**
 	 * Create an IntermediateListener that translates notifications of a
 	 * child into a broader range.
-	 * @param child
+	 * @param child child listener
 	 */
 	public IntermediateListener(ProgressListener child) {
+		this(child, 0, 100);
+	}
+	/**
+	 * Create an IntermediateListener that translates notifications of a
+	 * child into a broader range.
+	 * @param child child listener
+	 * @param min minimum value
+	 * @param max maximum value
+	 */
+	public IntermediateListener(ProgressListener child, float min, float max) {
+		this(child, min, max, "");
+	}
+	/**
+	 * Create an IntermediateListener that translates notifications of a
+	 * child into a broader range.
+	 * @param child child listener
+	 * @param prefix prefix of this listener
+	 */
+	public IntermediateListener(ProgressListener child, String prefix) {
+		this(child, 0, 100, prefix);
+	}
+	/**
+	 * Create an IntermediateListener that translates notifications of a
+	 * child into a broader range.
+	 * @param child child listener
+	 * @param min minimum value
+	 * @param max maximum value
+	 * @param prefix prefix of this listener
+	 */
+	public IntermediateListener(ProgressListener child, float min, float max, String prefix) {
 		this.child = child;
-		this.min = 0;
-		this.max = 100;
+		this.min = min;
+		this.max = max;
+		this.prefix = prefix;
 	}
 
 	/**
@@ -67,7 +99,7 @@ public IntermediateListener(ProgressListener child) {
 	public void notifyProgress(float level, String message) {
 		if(child!=null) {
 			float newlevel = min + level*(max-min)/100;
-			child.notifyProgress(newlevel,message);
+			child.notifyProgress(newlevel, prefix + message);
 		}
 	}
 
@@ -76,12 +108,19 @@ public void notifyProgress(float level, String message) {
 	 * when the child notifies 0, this IntermediateListener notifies the parent with 20%, and when
 	 * the child notifies 100, the IntermediateListener notifies 40. Any intermediate values are
 	 * linearly interpolated.
-	 * @param min
-	 * @param max
+	 * @param min minimum value
+	 * @param max maximum value
 	 */
 	public void setRange(float min, float max) {
 		this.min = min;
 		this.max = max;
 	}
+
+	/**
+	 * Set the prefix for this listener; it will be prepended to the messages of this listener
+	 * @param prefix the prefix
+	 */
+	public void setPrefix(String prefix) {
+		this.prefix = prefix;
+	}
 }
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java
index a8d8edaf..82575686 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java
@@ -27,7 +27,9 @@
 
 package org.rdfhdt.hdt.util.listener;
 
+import org.rdfhdt.hdt.listener.MultiThreadListener;
 import org.rdfhdt.hdt.listener.ProgressListener;
+import org.rdfhdt.hdt.util.concurrent.SyncListener;
 
 /**
  * @author mario.arias
@@ -45,7 +47,7 @@ public static void notify(ProgressListener listener, String message, float value
 
 	public static void notifyCond(ProgressListener listener, String message, long value, long total) {
 		if(listener!=null && (value%5000==0)) {
-			listener.notifyProgress( ((value)*100/total), message);
+			listener.notifyProgress( (float) ((value)*100/total), message);
 		}
 	}
 
@@ -54,4 +56,23 @@ public static void notifyCond(ProgressListener listener, String message, long count, long total) {
 			listener.notifyProgress( ((value)*100/total), message);
 		}
 	}
+
+	/**
+	 * convert a progress listener to a {@link org.rdfhdt.hdt.listener.MultiThreadListener}
+	 * @param listener the listener
+	 * @return a new multi thread listener, or the listener itself if it already was one
+	 */
+	public static MultiThreadListener multiThreadListener(ProgressListener listener) {
+		// null, create an empty one
+		if (listener == null) {
+			return new PrefixMultiThreadListener((a, b) -> {
+			});
+		}
+		// already a multi thread listener
+		if (listener instanceof MultiThreadListener) {
+			return (MultiThreadListener) listener;
+		}
+		// create a sync version of a prefix one
+		return new PrefixMultiThreadListener(SyncListener.of(listener));
+	}
 }
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixMultiThreadListener.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixMultiThreadListener.java
new file mode 100644
index 00000000..b424213c
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixMultiThreadListener.java
@@ -0,0 +1,24 @@
+package org.rdfhdt.hdt.util.listener;
+
+import org.rdfhdt.hdt.listener.MultiThreadListener;
+import org.rdfhdt.hdt.listener.ProgressListener;
+
+/**
+ * Simple implementation of {@link org.rdfhdt.hdt.listener.MultiThreadListener} redirecting all progress
+ * notifications to a progress listener, prefixing each message with the thread name
+ *
+ * @author Antoine Willerval
+ */
+public class PrefixMultiThreadListener implements MultiThreadListener {
+
+	private final ProgressListener progressListener;
+
+	public PrefixMultiThreadListener(ProgressListener progressListener) {
+		this.progressListener = progressListener;
+	}
+
+	@Override
+	public void notifyProgress(String thread, float level, String message) {
+		progressListener.notifyProgress(level, "[" + thread + "]" + message);
+	}
+}
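Aside: these listener pieces compose. IntermediateListener rescales a sub-task's 0..100 progress into a parent range and prefixes its messages, and ListenerUtil.multiThreadListener(...) adapts any ProgressListener for per-thread reporting. A small sketch; the values in the comments follow the formulas above:

    import org.rdfhdt.hdt.listener.MultiThreadListener;
    import org.rdfhdt.hdt.listener.ProgressListener;
    import org.rdfhdt.hdt.util.listener.IntermediateListener;
    import org.rdfhdt.hdt.util.listener.ListenerUtil;

    class ListenerSketch {
        public static void main(String[] args) {
            ProgressListener root = (level, message) -> System.out.println(level + " " + message);
            // map a sub-task's 0..100 into the parent's 20..40 and tag its messages
            IntermediateListener il = new IntermediateListener(root, 20, 40, "sort: ");
            il.notifyProgress(50, "half done"); // root sees 30.0 "sort: half done"
            // wrap any ProgressListener for multi-threaded reporting
            MultiThreadListener ml = ListenerUtil.multiThreadListener(root);
            ml.notifyProgress("worker-1", 75, "merging"); // root sees "[worker-1]merging"
        }
    }
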
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java
index 8e71a7c7..6ba9ecc4 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java
@@ -26,7 +26,9 @@
 
 package org.rdfhdt.hdt.util.string;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java
index 0d63c9d0..48865bef 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java
@@ -29,6 +29,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 
 import org.rdfhdt.hdt.exceptions.NotImplementedException;
@@ -57,7 +58,7 @@ public ReplazableString(int initialCapacity) {
 		used=0;
 	}
 	
-	private ReplazableString(byte [] buffer) {
+	public ReplazableString(byte [] buffer) {
 		this.buffer = buffer;
 		this.used = buffer.length;
 	}
@@ -71,7 +72,7 @@ private void ensureSize(int size) {
 			buffer = Arrays.copyOf(buffer, Math.max(size, buffer.length * 2));
 		}
 	}
-	
+
 	public void append(byte [] data, int offset, int len) {
 		this.replace(used, data, offset, len);
 	}
@@ -79,7 +80,7 @@ public void append(byte [] data, int offset, int len) {
 	public void append(BigByteBuffer data, long offset, int len) {
 		this.replace(used, data, offset, len);
 	}
-	
+
 	public void append(CharSequence other) {
 		ensureSize(this.used+other.length());
 		for(int i=0;i<other.length();i++) {

+	public static Collection<Object[]> params() {
+		return Arrays.asList(
+				new SequenceGenerator(
+						"SequenceLog64BigDisk",
+						SequenceLog64BigDisk::new
+				),
+				new SequenceGenerator(
+						"SequenceLog64",
+						((workFile, bits, elements) -> new SequenceLog64(bits, elements))
+				),
+				new SequenceGenerator(
+						"SequenceLog64Big",
+						((workFile, bits, elements) -> new SequenceLog64Big(bits, elements))
+				)
+		);
+	}
+
+	@Parameterized.Parameter
+	public SequenceGenerator sequenceGenerator;
+
+	@Rule
+	public TemporaryFolder tempDir = new TemporaryFolder();
+
+	private void sequenceTest(int bits, long elements, boolean trim) throws IOException {
+		long maxMask = (~0L) >>> (Long.SIZE - bits);
+
+		Path p = tempDir.newFolder().toPath();
+		try (DynamicSequence actual = sequenceGenerator.bld.generate(
+				p.resolve("test.seq").toString(),
+				trim ? 64 : bits,
+				elements)
+		) {
+			{
+				Random rnd = new Random(32);
+				for (long i = 0; i < elements; i++) {
+					long v = rnd.nextLong() & maxMask;
+					if (v < 0) {
+						v = -v;
+					}
+					actual.append(v);
+				}
+			}
+			{
+				Random rnd = new Random(32);
+				for (long i = 0; i < elements; i++) {
+					long v = rnd.nextLong() & maxMask;
+					if (v < 0) {
+						v = -v;
+					}
+					Assert.assertEquals(actual.get(i), v);
+				}
+			}
+			if (trim) {
+				actual.aggressiveTrimToSize();
+			}
+			{
+				Random rnd = new Random(32);
+				for (long i = 0; i < elements; i++) {
+					long v = rnd.nextLong() & maxMask;
+					if (v < 0) {
+						v = -v;
+					}
+					Assert.assertEquals("actual fail", actual.get(i), v);
+				}
+			}
+		}
+	}
+
+	@Test
+	public void littleTest() throws IOException {
+		sequenceTest(64, 100L, false);
+	}
+
+	@Test
+	public void bit64Test() throws IOException {
+		sequenceTest(64, 10_000L, false);
+	}
+
+	@Test
+	public void bit32Test() throws IOException {
+		sequenceTest(32, 10_000L, false);
+	}
+
+	@Test
+	public void bit64TrimTest() throws IOException {
+		sequenceTest(64, 10_000L, true);
+	}
+
+	@Test
+	public void bit32TrimTest() throws IOException {
+		sequenceTest(32, 10_000L, true);
+	}
+
+	private static class SequenceGenerator {
+		final String name;
+		final SequenceGeneratorBuilder bld;
+
+		public SequenceGenerator(String name, SequenceGeneratorBuilder bld) {
+			this.name = name;
+			this.bld = bld;
+		}
+
+		@Override
+		public String toString() {
+			return name;
+		}
+	}
+
+	@FunctionalInterface
+	private interface SequenceGeneratorBuilder {
+		DynamicSequence generate(String workFile, int bits, long elements);
+	}
+}
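Aside: the maxMask line in the test above builds an all-ones mask of the requested width so the random values fit the sequence's bit width. A worked micro-example:

    int bits = 32;
    long maxMask = (~0L) >>> (Long.SIZE - bits); // 0x00000000FFFFFFFFL
    long v = new java.util.Random(32).nextLong() & maxMask; // always in [0, 2^32 - 1]
    // for bits == 64 the mask is all ones and v may be negative,
    // hence the "if (v < 0) v = -v;" fix-up in the test
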
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java
new file mode 100644
index 00000000..13e0315b
--- /dev/null
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java
@@ -0,0 +1,177 @@
+package org.rdfhdt.hdt.dictionary.impl;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult;
+import org.rdfhdt.hdt.iterator.utils.ExceptionIterator;
+import org.rdfhdt.hdt.iterator.utils.FileTripleIterator;
+import org.rdfhdt.hdt.iterator.utils.MapIterator;
+import org.rdfhdt.hdt.triples.IndexedNode;
+import org.rdfhdt.hdt.util.concurrent.ExceptionThread;
+import org.rdfhdt.hdt.util.io.compress.CompressTest;
+import org.rdfhdt.hdt.util.string.ByteStringUtil;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+public class CompressFourSectionDictionaryTest {
+	@Test
+	public void compressDictTest() throws Exception {
+		TestCompressionResult result = new TestCompressionResult(
+				new CharSequence[]{
+						"2222", "4444", "5555", "7777", "9999", "9999"
+				},
+				new CharSequence[]{
+						"1111", "1111", "2222", "3333", "3333", "4444"
+				},
+				new CharSequence[]{
+						"1111", "3333", "3333", "4444", "6666", "7777", "8888"
+				}
+		);
+		List<CharSequence> expectedSubjects = Arrays.asList(
+				"2222", "5555", "9999"
+		);
+		List<CharSequence> expectedPredicates = Arrays.asList(
+				"1111", "2222", "3333", "4444"
+		);
+		List<CharSequence> expectedObjects = Arrays.asList(
+				"1111", "3333", "6666", "8888"
+		);
+		List<CharSequence> expectedShared = Arrays.asList(
+				"4444", "7777"
+		);
+		CompressFourSectionDictionary dictionary = new CompressFourSectionDictionary(result, new FakeNodeConsumer(), (p, m) -> {
+		});
+		Iterator<? extends CharSequence> su = dictionary.getSubjects().getSortedEntries();
+		Iterator<? extends CharSequence> pr = dictionary.getPredicates().getSortedEntries();
+		Iterator<? extends CharSequence> ob = dictionary.getObjects().getSortedEntries();
+		Iterator<? extends CharSequence> sh = dictionary.getShared().getSortedEntries();
+		ExceptionThread subjectReader = new ExceptionThread(() -> {
+			for (CharSequence e : expectedSubjects) {
+				Assert.assertTrue(su.hasNext());
+				CharSequence a = su.next();
+				Thread.sleep(40);
+				CompressTest.assertCharSequenceEquals("Subject", e, a);
+			}
+		}, "compressDictTestS");
+		ExceptionThread predicateReader = new ExceptionThread(() -> {
+			for (CharSequence e : expectedPredicates) {
+				Assert.assertTrue(pr.hasNext());
+				CharSequence a = pr.next();
+				Thread.sleep(40);
+				CompressTest.assertCharSequenceEquals("Predicate", e, a);
+			}
+		}, "compressDictTestP");
+		ExceptionThread objectReader = new ExceptionThread(() -> {
+			for (CharSequence e : expectedObjects) {
+				Assert.assertTrue(ob.hasNext());
+				CharSequence a = ob.next();
+				Thread.sleep(40);
+				CompressTest.assertCharSequenceEquals("Object", e, a);
+			}
+		}, "compressDictTestO");
+		ExceptionThread sharedReader = new ExceptionThread(() -> {
+			for (CharSequence e : expectedShared) {
+				Assert.assertTrue(sh.hasNext());
+				CharSequence a = sh.next();
+				Thread.sleep(40);
+				CompressTest.assertCharSequenceEquals("Shared", e, a);
+			}
+		}, "compressDictTestSh");
+
+		sharedReader.attach(
+				predicateReader,
+				objectReader,
+				subjectReader
+		).startAll().joinAndCrashIfRequired();
+	}
+
+	static class TestCompressionResult implements CompressionResult {
+		private final CharSequence[] subjects;
+		private final CharSequence[] predicates;
+		private final CharSequence[] objects;
+		// used to create fake ids to avoid duplicate assert errors
+		private int sid, pid, oid;
+
+		private final long size;
+
+		public TestCompressionResult(CharSequence[] subjects, CharSequence[] predicates, CharSequence[] objects) {
+			this.subjects = subjects;
+			this.predicates = predicates;
+			this.objects = objects;
+
+			size = Arrays.stream(subjects).mapToLong(s -> s.toString().getBytes(ByteStringUtil.STRING_ENCODING).length).sum()
+					+ Arrays.stream(predicates).mapToLong(s -> s.toString().getBytes(ByteStringUtil.STRING_ENCODING).length).sum()
+					+ Arrays.stream(objects).mapToLong(s -> s.toString().getBytes(ByteStringUtil.STRING_ENCODING).length).sum();
+		}
+
+		@Override
+		public long getTripleCount() {
+			return Math.max(subjects.length, Math.max(predicates.length, objects.length));
+		}
+
+		@Override
+		public ExceptionIterator<IndexedNode, IOException> getSubjects() {
+			return ExceptionIterator.of(new MapIterator<>(Arrays.asList(subjects).iterator(), s -> new IndexedNode(s, sid++)));
+		}
+
+		@Override
+		public ExceptionIterator<IndexedNode, IOException> getPredicates() {
+			return ExceptionIterator.of(new MapIterator<>(Arrays.asList(predicates).iterator(), s -> new IndexedNode(s, pid++)));
+		}
+
+		@Override
+		public ExceptionIterator<IndexedNode, IOException> getObjects() {
+			return ExceptionIterator.of(new MapIterator<>(Arrays.asList(objects).iterator(), s -> new IndexedNode(s, oid++)));
+		}
+
+		@Override
+		public long getSubjectsCount() {
+			return subjects.length;
+		}
+
+		@Override
+		public long getPredicatesCount() {
+			return predicates.length;
+		}
+
+		@Override
+		public long getObjectsCount() {
+			return objects.length;
+		}
+
+		@Override
+		public long getSharedCount() {
+			return Math.min(subjects.length, objects.length);
+		}
+
+		@Override
+		public void delete() {
+		}
+
+		@Override
+		public long getRawSize() {
+			return size;
+		}
+
+		@Override
+		public void close() {
+		}
+	}
+
+	static class FakeNodeConsumer implements CompressFourSectionDictionary.NodeConsumer {
+		@Override
+		public void onSubject(long preMapId, long newMapId) {
+		}
+
+		@Override
+		public void onPredicate(long preMapId, long newMapId) {
+		}
+
+		@Override
+		public void onObject(long preMapId, long newMapId) {
+		}
+	}
+}
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java
new file mode 100644
index 00000000..693131c4
--- /dev/null
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java
@@ -0,0 +1,69 @@
+package org.rdfhdt.hdt.dictionary.impl.section;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.rdfhdt.hdt.iterator.utils.ExceptionIterator;
+import org.rdfhdt.hdt.iterator.utils.MapIterator;
+import org.rdfhdt.hdt.options.HDTSpecification;
+import org.rdfhdt.hdt.triples.IndexedNode;
+import org.rdfhdt.hdt.util.io.compress.CompressUtil;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+public class OneReadDictionarySectionTest {
+
+	@Test
+	public void sectionTest() {
+		List<IndexedNode> aa = Arrays.asList(
+				new IndexedNode("1", 1),
+				new IndexedNode("2", 2),
+				new IndexedNode("2", 3),
+				new IndexedNode("3", 4),
+				new IndexedNode("4", 5),
+				new IndexedNode("5", 6),
+				new IndexedNode("5", 7),
+				new IndexedNode("5", 8),
+				new IndexedNode("6", 9),
+				new IndexedNode("7", 10),
+				new IndexedNode("8", 11),
+				new IndexedNode("9", 12)
+		);
+
+		OneReadDictionarySection sec1 = new OneReadDictionarySection(
+				removeDupe(aa),
+				aa.size()
+		);
+		assertIteratorEquals(removeDupe(aa), sec1.getSortedEntries());
+
+		OneReadDictionarySection sec2 = new OneReadDictionarySection(
+				removeDupe(aa),
+				aa.size()
+		);
+
+		PFCDictionarySection section = new PFCDictionarySection(new HDTSpecification());
+		section.load(sec2, null);
+
+		assertIteratorEquals(removeDupe(aa), section.getSortedEntries());
+	}
+
+	private void assertIteratorEquals(Iterator<? extends CharSequence> it1, Iterator<? extends CharSequence> it2) {
+		while (it1.hasNext()) {
+			Assert.assertTrue(it2.hasNext());
+			Assert.assertEquals(it1.next().toString(), it2.next().toString());
+		}
+		Assert.assertFalse(it2.hasNext());
+	}
+
+	private Iterator<CharSequence> removeDupe(List<IndexedNode> nodes) {
+		return new MapIterator<>(
+				CompressUtil.asNoDupeCharSequenceIterator(
+						ExceptionIterator.of(nodes.iterator()),
+						(i, j, k) -> {
+						}
+				), IndexedNode::getNode
+		);
+	}
+}
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java
new file mode 100644
index 00000000..cbedd70e
--- /dev/null
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java
@@ -0,0 +1,654 @@
+package org.rdfhdt.hdt.hdt;
+
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Suite;
+import org.rdfhdt.hdt.dictionary.Dictionary;
+import org.rdfhdt.hdt.dictionary.DictionarySection;
+import org.rdfhdt.hdt.enums.CompressionType;
+import org.rdfhdt.hdt.enums.RDFNotation;
+import org.rdfhdt.hdt.exceptions.NotFoundException;
+import org.rdfhdt.hdt.exceptions.ParserException;
+import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult;
+import org.rdfhdt.hdt.listener.ProgressListener;
+import org.rdfhdt.hdt.options.HDTOptions;
+import org.rdfhdt.hdt.options.HDTOptionsKeys;
+import org.rdfhdt.hdt.options.HDTSpecification;
+import org.rdfhdt.hdt.rdf.RDFFluxStop;
+import org.rdfhdt.hdt.rdf.RDFParserFactory;
+import org.rdfhdt.hdt.triples.IteratorTripleString;
+import org.rdfhdt.hdt.triples.TripleString;
+import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils;
+import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier;
+import org.rdfhdt.hdt.util.StopWatch;
+import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest;
+import org.rdfhdt.hdt.util.io.IOUtil;
+import org.rdfhdt.hdt.util.io.compress.CompressTest;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+import static org.junit.Assert.*;
+
+@RunWith(Suite.class)
+@Suite.SuiteClasses({
+		HDTManagerTest.DynamicDiskTest.class,
+		HDTManagerTest.DynamicCatTreeTest.class,
+		HDTManagerTest.StaticTest.class
+})
+public class HDTManagerTest {
+	private static class HDTManagerTestBase extends AbstractMapMemoryTest implements ProgressListener {
+		protected static final long SIZE = 1L << 16;
+		@Rule
+		public TemporaryFolder tempDir = new TemporaryFolder();
+		protected HDTSpecification spec;
+
+		@Before
+		public void setupManager() throws IOException {
+			spec = new HDTSpecification();
+			spec.set("loader.disk.location", tempDir.newFolder().getAbsolutePath());
+		}
+
+		@Override
+		public void notifyProgress(float level, String message) {
+			// System.out.println("[" + level + "] " + message);
+		}
+
+		protected void assertEqualsHDT(HDT expected, HDT actual) throws NotFoundException {
+			assertEqualsHDT(expected, actual, 0);
+		}
+
+		protected void assertEqualsHDT(HDT expected, HDT actual, int ignoredHeader) throws NotFoundException {
+			// test dictionary
+			Dictionary ed = expected.getDictionary();
+			Dictionary ad = actual.getDictionary();
+			assertEqualsHDT("Subjects", ed.getSubjects(), ad.getSubjects());
+			assertEqualsHDT("Predicates", ed.getPredicates(), ad.getPredicates());
+			assertEqualsHDT("Objects", ed.getObjects(), ad.getObjects());
+			assertEqualsHDT("Shared", ed.getShared(), ad.getShared());
+			assertEquals(ed.getType(), ad.getType());
+
+			// test triples
+			IteratorTripleString actualIt = actual.search("", "", "");
+			IteratorTripleString expectedIt = expected.search("", "", "");
+
+			while (expectedIt.hasNext()) {
+				assertTrue(actualIt.hasNext());
+
+				TripleString expectedTriple = expectedIt.next();
+				TripleString actualTriple = actualIt.next();
+				assertEquals(expectedIt.getLastTriplePosition(), actualIt.getLastTriplePosition());
+				assertEquals(expectedTriple, actualTriple);
+			}
+			assertFalse(actualIt.hasNext());
+
+			// test header
+			assertEquals(expected.getHeader().getBaseURI(), actual.getHeader().getBaseURI());
+			if (expected.getHeader().getNumberOfElements() + ignoredHeader != actual.getHeader().getNumberOfElements()) {
+				StringBuilder bld = new StringBuilder();
+
+				bld.append("-------- Header expected:");
+				expected.getHeader().search(null, null, null).forEachRemaining(bld::append);
+				bld.append("-------- Header actual:");
+				actual.getHeader().search(null, null, null).forEachRemaining(bld::append);
+
+				fail("Size of the header doesn't match " + bld + expected.getHeader().getNumberOfElements() + " + " + ignoredHeader + " != " + actual.getHeader().getNumberOfElements());
+			}
+		}
+
+		protected void assertEqualsHDT(String section, DictionarySection expected, DictionarySection actual) {
+			Iterator<? extends CharSequence> itEx = expected.getSortedEntries();
+			Iterator<? extends CharSequence> itAc = actual.getSortedEntries();
+
+			while (itEx.hasNext()) {
+				assertTrue(itAc.hasNext());
+				CharSequence expectedTriple = itEx.next();
+				CharSequence actualTriple = itAc.next();
+				CompressTest.assertCharSequenceEquals(section + " section strings", expectedTriple, actualTriple);
+			}
+			assertFalse(itAc.hasNext());
+			assertEquals(expected.getNumberOfElements(), actual.getNumberOfElements());
+		}
+	}
+
+	@RunWith(Parameterized.class)
+	public static class DynamicDiskTest extends HDTManagerTestBase {
+
+		@Parameterized.Parameters(name = "{0}")
+		public static Collection<Object[]> params() {
+			List<Object[]> params = new ArrayList<>();
+			for (int threads : new int[]{
+					// sync
+					1,
+					// async, low thread count
+					2,
+					// async, large thread count
+					8
+			}) {
+				List<String> modes;
+				if (threads > 1) {
+					// async, no need for partial
+					modes = List.of(
+							HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE
+					);
+				} else {
+					modes = List.of(
+							HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL,
+							HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE
+					);
+				}
+				for (String mode : modes) {
+					params.addAll(List.of(
+							new Object[]{"base-w" + threads + "-" + mode, SIZE * 8, 20, 50, threads, mode, false},
+							new Object[]{"duplicates-w" + threads + "-" + mode, SIZE * 8, 10, 50, threads, mode, false},
+							new Object[]{"large-literals-w" + threads + "-" + mode, SIZE * 2, 20, 250, threads, mode, false},
+							new Object[]{"quiet-w" + threads + "-" + mode, SIZE * 8, 10, 50, threads, mode, false}
+					));
+				}
+			}
+			return params;
+		}
+
+		@Parameterized.Parameter
+		public String name;
+		@Parameterized.Parameter(1)
+		public long maxSize;
+		@Parameterized.Parameter(2)
+		public int maxElementSplit;
+		@Parameterized.Parameter(3)
+		public int maxLiteralSize;
+		@Parameterized.Parameter(4)
+		public int threads;
+		@Parameterized.Parameter(5)
+		public String compressMode;
+		@Parameterized.Parameter(6)
+		public boolean quiet;
+
+		@Before
+		public void setupSpecs() {
+			spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY, String.valueOf(threads));
+			spec.set("loader.disk.compressMode", compressMode);
+		}
+
+		private void generateDiskTest() throws IOException, ParserException, NotFoundException, InterruptedException {
+			LargeFakeDataSetStreamSupplier supplier =
+					LargeFakeDataSetStreamSupplier
+							.createSupplierWithMaxSize(maxSize, 34)
+							.withMaxElementSplit(maxElementSplit)
+							.withMaxLiteralSize(maxLiteralSize);
+
+			// create DISK HDT
+			LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.GZIP);
+			HDT actual = null;
+			try {
+				actual = HDTManager.generateHDTDisk(
+						genActual.getStream(),
+						HDTTestUtils.BASE_URI,
+						RDFNotation.NTRIPLES,
+						CompressionType.GZIP,
+						spec,
+						quiet ? null : this
+				);
+			} finally {
+				if (actual == null) {
+					genActual.getThread().interrupt();
+				}
+			}
+			genActual.getThread().joinAndCrashIfRequired();
+
+			supplier.reset();
+
+			LargeFakeDataSetStreamSupplier.ThreadedStream genExpected = supplier.createNTInputStream(CompressionType.GZIP);
+			// create MEMORY HDT
+			HDT expected = null;
+			try {
+				expected = HDTManager.generateHDT(
+						genExpected.getStream(),
+						HDTTestUtils.BASE_URI,
+						RDFNotation.NTRIPLES,
+						CompressionType.GZIP,
+						spec,
+						null
+				);
+			} finally {
+				if (expected == null) {
+					genExpected.getThread().interrupt();
+				}
+			}
+			genExpected.getThread().joinAndCrashIfRequired();
+
+			// happy compiler, should have thrown before
+			assertNotNull(expected);
+			assertNotNull(actual);
+			try {
+				assertEqualsHDT(expected, actual);
+			} finally {
+				IOUtil.closeAll(expected, actual);
+			}
+		}
+
+		@Test
+		public void generateSaveLoadMapTest() throws IOException, ParserException, NotFoundException {
+			LargeFakeDataSetStreamSupplier supplier =
+					LargeFakeDataSetStreamSupplier
+							.createSupplierWithMaxSize(maxSize, 34)
+							.withMaxElementSplit(maxElementSplit)
+							.withMaxLiteralSize(maxLiteralSize);
+
+			// create MEMORY HDT
+			try (HDT expected = HDTManager.generateHDT(
+					supplier.createTripleStringStream(),
+					HDTTestUtils.BASE_URI,
+					spec,
+					quiet ? null : this
+			)) {
+				String tmp = tempDir.newFile().getAbsolutePath();
+				expected.saveToHDT(tmp, null);
+
+				try (HDT mapExpected = HDTManager.mapHDT(tmp, quiet ? null : this)) {
+					assertEqualsHDT(expected, mapExpected);
+				}
+
+				try (HDT loadExpected = HDTManager.loadHDT(tmp, quiet ? null : this)) {
+					assertEqualsHDT(expected, loadExpected);
+				}
+			}
+		}
+
+		@Test
+		public void generateDiskMemTest() throws IOException, ParserException, NotFoundException, InterruptedException {
+			spec.setInt("loader.disk.chunkSize", SIZE);
+			generateDiskTest();
+		}
+
+		@Test
+		public void generateDiskMapTest() throws IOException, ParserException, NotFoundException, InterruptedException {
+			spec.setInt("loader.disk.chunkSize", SIZE);
+			File mapHDT = tempDir.newFile("mapHDTTest.hdt");
+			spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath());
+			generateDiskTest();
+			Files.deleteIfExists(mapHDT.toPath());
+		}
+
+		@Test
+		public void catTreeTest() throws IOException, ParserException, NotFoundException, InterruptedException {
+			LargeFakeDataSetStreamSupplier supplier =
+					LargeFakeDataSetStreamSupplier
+							.createSupplierWithMaxSize(maxSize, 34)
+							.withMaxElementSplit(maxElementSplit)
+							.withMaxLiteralSize(maxLiteralSize);
+
+			// create DISK HDT
+			LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.NONE);
+			HDT actual = null;
+			try {
+				actual = HDTManager.catTree(
+						RDFFluxStop.sizeLimit(SIZE),
+						HDTSupplier.memory(),
+						genActual.getStream(),
+						HDTTestUtils.BASE_URI,
+						RDFNotation.NTRIPLES,
+						spec,
+						quiet ? null : this
+				);
+			} finally {
+				if (actual == null) {
+					genActual.getThread().interrupt();
+				}
+			}
+			genActual.getThread().joinAndCrashIfRequired();
+
+			supplier.reset();
+
+			Iterator<TripleString> genExpected = supplier.createTripleStringStream();
+			// create MEMORY HDT
+			HDT expected = HDTManager.generateHDT(
+					genExpected,
+					HDTTestUtils.BASE_URI,
+					spec,
+					null
+			);
+
+			// happy compiler, should have thrown before
+			assertNotNull(expected);
+			assertNotNull(actual);
+			try {
+				assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat
+			} finally {
+				IOUtil.closeAll(expected, actual);
+			}
+		}
+
+		@Test
+		public void catTreeDiskTest() throws IOException, ParserException, NotFoundException, InterruptedException {
+			LargeFakeDataSetStreamSupplier supplier =
+					LargeFakeDataSetStreamSupplier
+							.createSupplierWithMaxSize(maxSize, 34)
+							.withMaxElementSplit(maxElementSplit)
+							.withMaxLiteralSize(maxLiteralSize);
+
+			// create DISK HDT
+			LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.NONE);
+			HDT actual = null;
+			try {
+				actual = HDTManager.catTree(
+						RDFFluxStop.sizeLimit(SIZE),
+						HDTSupplier.disk(),
+						genActual.getStream(),
+						HDTTestUtils.BASE_URI,
+						RDFNotation.NTRIPLES,
+						spec,
+						quiet ? null : this
+				);
+			} finally {
+				if (actual == null) {
+					genActual.getThread().interrupt();
+				}
+			}
+			genActual.getThread().joinAndCrashIfRequired();
+
+			supplier.reset();
+
+			Iterator<TripleString> genExpected = supplier.createTripleStringStream();
+			// create MEMORY HDT
+			HDT expected = HDTManager.generateHDT(
+					genExpected,
+					HDTTestUtils.BASE_URI,
+					spec,
+					null
+			);
+
+			// happy compiler, should have thrown before
+			assertNotNull(expected);
+			assertNotNull(actual);
+			try {
+				assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat
+			} finally {
+				IOUtil.closeAll(expected, actual);
+			}
+		}
+	}
+
+	@RunWith(Parameterized.class)
+	public static class DynamicCatTreeTest extends HDTManagerTestBase {
+
+		@Parameterized.Parameters(name = "{0}")
+		public static Collection<Object[]> params() {
+			return List.of(
+					new Object[]{"base", SIZE * 16, 20, 50, false},
+					new Object[]{"duplicates", SIZE * 16, 10, 50, false},
+					new Object[]{"large-literals", SIZE * 4, 20, 250, false},
+					new Object[]{"quiet", SIZE * 16, 10, 50, false}
+			);
+		}
+
+		@Parameterized.Parameter
+		public String name;
+		@Parameterized.Parameter(1)
+		public long maxSize;
+		@Parameterized.Parameter(2)
+		public int maxElementSplit;
+		@Parameterized.Parameter(3)
+		public int maxLiteralSize;
+		@Parameterized.Parameter(4)
+		public boolean quiet;
+
+		@Test
+		public void catTreeTest() throws IOException, ParserException, NotFoundException, InterruptedException {
+			LargeFakeDataSetStreamSupplier supplier =
+					LargeFakeDataSetStreamSupplier
+							.createSupplierWithMaxSize(maxSize, 34)
+							.withMaxElementSplit(maxElementSplit)
+							.withMaxLiteralSize(maxLiteralSize);
+
+			// create DISK HDT
+			LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.NONE);
+			HDT actual = null;
+			HDT expected = null;
+			try {
+				try {
+					actual = HDTManager.catTree(
+							RDFFluxStop.sizeLimit(SIZE),
+							HDTSupplier.memory(),
+							genActual.getStream(),
+							HDTTestUtils.BASE_URI,
+							RDFNotation.NTRIPLES,
+							spec,
+							quiet ? null : this
+					);
+				} finally {
+					if (actual == null) {
+						genActual.getThread().interrupt();
+					}
+				}
+				genActual.getThread().joinAndCrashIfRequired();
+
+				supplier.reset();
+
+				Iterator<TripleString> genExpected = supplier.createTripleStringStream();
+				// create MEMORY HDT
+				expected = HDTManager.generateHDT(
+						genExpected,
+						HDTTestUtils.BASE_URI,
+						spec,
+						null
+				);
+
+				// happy compiler, should have thrown before
+				assertNotNull(expected);
+				assertNotNull(actual);
+				assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat
+			} finally {
+				IOUtil.closeAll(expected, actual);
+			}
+		}
+
+		@Test
+		public void catTreeDiskTest() throws IOException, ParserException, NotFoundException, InterruptedException {
+			LargeFakeDataSetStreamSupplier supplier =
+					LargeFakeDataSetStreamSupplier
+							.createSupplierWithMaxSize(maxSize, 34)
+							.withMaxElementSplit(maxElementSplit)
+							.withMaxLiteralSize(maxLiteralSize);
+
+			// create DISK HDT
+			LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.NONE);
+			HDT actual = null;
+			try {
+				actual = HDTManager.catTree(
+						RDFFluxStop.sizeLimit(SIZE),
+						HDTSupplier.disk(),
+						genActual.getStream(),
+						HDTTestUtils.BASE_URI,
+						RDFNotation.NTRIPLES,
+						spec,
+						quiet ? null : this
+				);
+			} finally {
+				if (actual == null) {
+					genActual.getThread().interrupt();
+				}
+			}
+			genActual.getThread().joinAndCrashIfRequired();
+
+			supplier.reset();
+
+			Iterator<TripleString> genExpected = supplier.createTripleStringStream();
+			// create MEMORY HDT
+			HDT expected = HDTManager.generateHDT(
+					genExpected,
+					HDTTestUtils.BASE_URI,
+					spec,
+					null
+			);
+
+			// happy compiler, should have thrown before
+			assertNotNull(expected);
+			assertNotNull(actual);
+			try {
+				assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat
+			} finally {
+				IOUtil.closeAll(expected, actual);
+			}
+		}
+	}
+
+	@RunWith(Parameterized.class)
+	public static class StaticTest extends HDTManagerTestBase {
+		@Parameterized.Parameters(name = "{0}")
+		public static Collection<Object[]> params() {
+			return List.of(
+					new Object[]{"hdtGenDisk/unicode_disk_encode.nt", true}
+			);
+		}
+
+		@Parameterized.Parameter
+		public String file;
+		@Parameterized.Parameter(1)
+		public boolean quiet;
+
+		private void generateDiskTest() throws IOException, ParserException, NotFoundException {
+			String ntFile = Objects.requireNonNull(getClass().getClassLoader().getResource(file), "Can't find " + file).getFile();
+			// create DISK HDT
+			HDT actual = HDTManager.generateHDTDisk(
+					ntFile,
+					HDTTestUtils.BASE_URI,
+					RDFNotation.NTRIPLES,
+					spec,
+					quiet ? null : this
+			);
+
+			// create MEMORY HDT
+			HDT expected = HDTManager.generateHDT(
+					ntFile,
+					HDTTestUtils.BASE_URI,
+					RDFNotation.NTRIPLES,
+					spec,
+					null
+			);
+
+			try {
+				assertEqualsHDT(expected, actual);
+			} finally {
+				IOUtil.closeAll(expected, actual);
+			}
+		}
+
+		@Test
+		public void generateDiskCompleteTest() throws IOException, ParserException, NotFoundException {
+			spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_COMPLETE);
+			spec.setInt("loader.disk.chunkSize", SIZE);
+			generateDiskTest();
+		}
+
+		@Test
+		public void generateDiskPartialTest() throws IOException, ParserException, NotFoundException {
+			spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_PARTIAL);
+			spec.setInt("loader.disk.chunkSize", SIZE);
+			generateDiskTest();
+		}
+
+		@Test
+		public void generateDiskCompleteMapTest() throws IOException, ParserException, NotFoundException {
+			spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_COMPLETE);
+			spec.setInt("loader.disk.chunkSize", SIZE);
+			File mapHDT = tempDir.newFile("mapHDTTest.hdt");
+			spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath());
+			generateDiskTest();
+			Files.deleteIfExists(mapHDT.toPath());
+		}
+
+		@Test
+		public void generateDiskPartialMapTest() throws IOException, ParserException, NotFoundException {
+			spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_PARTIAL);
+			spec.setInt("loader.disk.chunkSize", SIZE);
+			File mapHDT = tempDir.newFile("mapHDTTest.hdt");
+			spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath());
+			generateDiskTest();
+			Files.deleteIfExists(mapHDT.toPath());
+		}
+
+		@Test
+		public void generateTest() throws IOException, ParserException, NotFoundException {
+			String ntFile = Objects.requireNonNull(getClass().getClassLoader().getResource(file), "Can't find " + file).getFile();
+			// create DISK HDT
+			try (InputStream in = IOUtil.getFileInputStream(ntFile)) {
+				Iterator<TripleString> it = RDFParserFactory.readAsIterator(
+						RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES, true),
+						in, HDTTestUtils.BASE_URI, true, RDFNotation.NTRIPLES
+				);
+				HDT expected = HDTManager.generateHDT(
+						it,
+						HDTTestUtils.BASE_URI,
+						spec,
+						quiet ? null : this
+				);
+
+				String testCopy = tempDir.newFile().getAbsolutePath();
+				expected.saveToHDT(testCopy, null);
+
+				// load the copy
+				HDT actual = HDTManager.loadHDT(testCopy);
+
+				try {
+					assertEqualsHDT(expected, actual);
+				} finally {
+					IOUtil.closeAll(expected, actual);
+				}
+			}
+		}
+	}
+
+	@Ignore("handTests")
+	public static class HandTest extends HDTManagerTestBase {
+		@Test
+		public void bigDiskTest() throws ParserException, IOException {
+			LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier
+					.createSupplierWithMaxSize(10_000_000_000L, 94);
+
+			Path output = tempDir.newFolder().toPath();
+
+			HDTOptions spec = new HDTSpecification();
+			spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, output.resolve("future.hdt").toAbsolutePath().toString());
+			spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, output.resolve("gen_dir").toAbsolutePath().toString());
+			StopWatch watch = new StopWatch();
+			watch.reset();
+			try (HDT hdt = HDTManager.generateHDTDisk(supplier.createTripleStringStream(), "http://ex.ogr/#", spec,
+					(level, message) -> System.out.println("[" + level + "] " + message)
+			)) {
+				System.out.println(watch.stopAndShow());
+				System.out.println(hdt.getTriples().getNumberOfElements());
+			}
+		}
+
+		@Test
+		public void bigCatTreeDiskTest() throws ParserException, IOException {
+			HDTOptions spec = new HDTSpecification();
+			StopWatch watch = new StopWatch();
+			spec.set(HDTOptionsKeys.LOADER_CATTREE_LOCATION_KEY, "C:\\WIKI\\CATTREE\\WORKING");
+			spec.set(HDTOptionsKeys.LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY, "C:\\ISWC\\CATTREE\\future.hdt");
+			spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, "C:\\WIKI\\CATTREE\\WORKING_HDTDISK");
+			spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY, "12");
+			spec.set(HDTOptionsKeys.NT_SIMPLE_PARSER_KEY, "true");
+			watch.reset();
+			try (HDT hdt = HDTManager.catTree(
+					RDFFluxStop.sizeLimit(100_000_000_000L) // 300GB free
+							.and(RDFFluxStop.countLimit(700_000_000L) // ~9GB maps
+							), HDTSupplier.disk(),
+					"M:\\WIKI\\latest-all.nt.bz2", HDTTestUtils.BASE_URI, RDFNotation.NTRIPLES, spec,
+					(level, message) -> System.out.println("[" + level + "] " + message)
+			)) {
+				System.out.println(watch.stopAndShow());
+				System.out.println(hdt.getTriples().getNumberOfElements());
+			}
+		}
+	}
+}
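Aside (not part of the patch): the HandTest above shows the intended production recipe, bounding each chunk by both parsed size and triple count before catting the chunk HDTs. A smaller, hypothetical variant of the same call with placeholder paths and limits:

    HDTSpecification spec = new HDTSpecification();
    spec.set("loader.cattree.location", "/tmp/cattree-work"); // placeholder work dir
    try (HDT hdt = HDTManager.catTree(
            // stop a chunk at ~1GB of parsed data or 1M triples, whichever comes first
            RDFFluxStop.sizeLimit(1_000_000_000L).and(RDFFluxStop.countLimit(1_000_000L)),
            HDTSupplier.memory(), // each chunk fits in memory; HDTSupplier.disk() if not
            "dataset.nt", "http://example.org/#", RDFNotation.NTRIPLES, spec,
            null)) {
        hdt.saveToHDT("dataset.hdt", null);
    }
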
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java
deleted file mode 100644
index bc7455bd..00000000
--- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HDTCatTreeTest.java
+++ /dev/null
@@ -1,208 +0,0 @@
-package org.rdfhdt.hdt.hdtCat;
-
-import org.junit.Before;
-import org.junit.Ignore;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Suite;
-import org.rdfhdt.hdt.dictionary.Dictionary;
-import org.rdfhdt.hdt.dictionary.DictionarySection;
-import org.rdfhdt.hdt.enums.RDFNotation;
-import org.rdfhdt.hdt.exceptions.NotFoundException;
-import org.rdfhdt.hdt.exceptions.ParserException;
-import org.rdfhdt.hdt.hdt.HDT;
-import org.rdfhdt.hdt.hdt.HDTManager;
-import org.rdfhdt.hdt.hdt.HDTSupplier;
-import org.rdfhdt.hdt.listener.ProgressListener;
-import org.rdfhdt.hdt.options.HDTOptions;
-import org.rdfhdt.hdt.options.HDTSpecification;
-import org.rdfhdt.hdt.rdf.RDFFluxStop;
-import org.rdfhdt.hdt.triples.IteratorTripleString;
-import org.rdfhdt.hdt.triples.TripleString;
-import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils;
-import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier;
-import org.rdfhdt.hdt.util.StopWatch;
-import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest;
-import org.rdfhdt.hdt.util.io.IOUtil;
-import org.rdfhdt.hdt.util.string.CharSequenceComparator;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-@RunWith(Suite.class)
-@Suite.SuiteClasses({
-		HDTCatTreeTest.DynamicTest.class
-})
-public class HDTCatTreeTest {
-	private static class HDTManagerTestBase extends AbstractMapMemoryTest implements ProgressListener {
-		protected static final long SIZE = 1L << 15;
-		@Rule
-		public TemporaryFolder tempDir = new TemporaryFolder();
-		protected Path workDir;
-		protected HDTSpecification spec;
-
-		@Before
-		public void setupManager() throws IOException {
-			spec = new HDTSpecification();
-			workDir = tempDir.newFolder().toPath();
-			spec.set("loader.cattree.location", workDir.toAbsolutePath().toString());
-		}
-
-		@Override
-		public void notifyProgress(float level, String message) {
-			// System.out.println("[" + level + "] " + message);
-		}
-
-		protected void assertEqualsHDT(HDT expected, HDT actual, int ignoredHeader) throws NotFoundException {
-
-			// test dictionary
-			Dictionary ed = expected.getDictionary();
-			Dictionary ad = actual.getDictionary();
-			assertEqualsHDT("Subjects", ed.getSubjects(), ad.getSubjects());
-			assertEqualsHDT("Predicates", ed.getPredicates(), ad.getPredicates());
-			assertEqualsHDT("Objects", ed.getObjects(), ad.getObjects());
-			assertEqualsHDT("Shared", ed.getShared(), ad.getShared());
-			assertEquals(ed.getType(), ad.getType());
-
-			// test triples
-			IteratorTripleString actualIt = actual.search("", "", "");
-			IteratorTripleString expectedIt = expected.search("", "", "");
-
-			while (expectedIt.hasNext()) {
-				assertTrue(actualIt.hasNext());
-
-				TripleString expectedTriple = expectedIt.next();
-				TripleString actualTriple = actualIt.next();
-				assertEquals(expectedIt.getLastTriplePosition(), actualIt.getLastTriplePosition());
-				assertEquals(expectedTriple, actualTriple);
-			}
-			assertFalse(actualIt.hasNext());
-
-			// test header
-			assertEquals(expected.getHeader().getBaseURI(), actual.getHeader().getBaseURI());
-			assertEquals(expected.getHeader().getNumberOfElements() + ignoredHeader, actual.getHeader().getNumberOfElements());
-		}
-
-		protected void assertEqualsHDT(String section, DictionarySection excepted, DictionarySection actual) {
-			Iterator<? extends CharSequence> itEx = excepted.getSortedEntries();
-			Iterator<? extends CharSequence> itAc = actual.getSortedEntries();
-			Comparator<CharSequence> csc = CharSequenceComparator.getInstance();
-
-			while (itEx.hasNext()) {
-				assertTrue(itAc.hasNext());
-				CharSequence expectedTriple = itEx.next();
-				CharSequence actualTriple = itAc.next();
-				assertEquals(section + " section strings", 0, csc.compare(expectedTriple, actualTriple));
-			}
-			assertFalse(itAc.hasNext());
-			assertEquals(excepted.getNumberOfElements(), actual.getNumberOfElements());
-		}
-	}
-
-	@RunWith(Parameterized.class)
-	public static class DynamicTest extends HDTManagerTestBase {
-
-		@Parameterized.Parameters(name = "{0}")
-		public static Collection<Object[]> params() {
-			return List.of(
-					new Object[]{"base", SIZE * 16, 20, 50, false},
-					new Object[]{"duplicates", SIZE * 16, 10, 50, false},
new Object[]{"large-literals", SIZE * 4, 20, 250, false}, - new Object[]{"quiet", SIZE * 16, 10, 50, false} - ); - } - - @Parameterized.Parameter - public String name; - @Parameterized.Parameter(1) - public long maxSize; - @Parameterized.Parameter(2) - public int maxElementSplit; - @Parameterized.Parameter(3) - public int maxLiteralSize; - @Parameterized.Parameter(4) - public boolean quiet; - - @Test - public void catTreeTest() throws IOException, ParserException, NotFoundException, InterruptedException { - LargeFakeDataSetStreamSupplier supplier = - LargeFakeDataSetStreamSupplier - .createSupplierWithMaxSize(maxSize, 34) - .withMaxElementSplit(maxElementSplit) - .withMaxLiteralSize(maxLiteralSize); - - // create DISK HDT - LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(); - HDT actual = null; - try { - actual = HDTManager.catTree( - RDFFluxStop.sizeLimit(SIZE), - HDTSupplier.memory(), - genActual.getStream(), - HDTTestUtils.BASE_URI, - RDFNotation.NTRIPLES, - spec, - quiet ? null : this - ); - } finally { - if (actual == null) { - genActual.getThread().interrupt(); - } - } - genActual.getThread().joinAndCrashIfRequired(); - - supplier.reset(); - - Iterator genExpected = supplier.createTripleStringStream(); - // create MEMORY HDT - HDT expected = HDTManager.generateHDT( - genExpected, - HDTTestUtils.BASE_URI, - spec, - null - ); - - // happy compiler, should throw before - assertNotNull(expected); - assertNotNull(actual); - try { - assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat - } finally { - IOUtil.closeAll(expected, actual); - } - } - - } - - @Ignore("handTests") - public static class HandTest extends HDTManagerTestBase { - @Test - public void bigTest() throws ParserException, IOException { - LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxSize(10_000_000_000L, 94); - - HDTOptions spec = new HDTSpecification(); - StopWatch watch = new StopWatch(); - watch.reset(); - try (HDT hdt = HDTManager.catTree(RDFFluxStop.sizeLimit(1_000_000_000), HDTSupplier.memory(), - supplier.createTripleStringStream(), HDTTestUtils.BASE_URI, spec, - (level, message) -> System.out.println("[" + level + "] " + message) - )) { - System.out.println(watch.stopAndShow()); - System.out.println(hdt.getTriples().getNumberOfElements()); - } - } - } -} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIteratorTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIteratorTest.java new file mode 100644 index 00000000..115ba86e --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIteratorTest.java @@ -0,0 +1,28 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.function.Function; + +import static org.junit.Assert.*; + +public class MergeExceptionIteratorTest { + + @Test + public void mergeTest() { + ExceptionIterator it1 = ExceptionIterator.of(Arrays.asList("1", "3", "5", "7").iterator()); + ExceptionIterator it2 = ExceptionIterator.of(Arrays.asList("2", "4", "6", "6").iterator()); + + ExceptionIterator it = MergeExceptionIterator.buildOfTree(Function.identity(), String::compareTo, List.of(it1, it2), 0, 2); + + ExceptionIterator itExcepted = ExceptionIterator.of(Arrays.asList("1", "2", "3", "4", "5", "6", "6", "7").iterator()); + + while (itExcepted.hasNext()) { + assertTrue(it.hasNext()); + 
assertEquals(itExcepted.next(), it.next()); + } + assertFalse(it.hasNext()); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java index ca372111..4ec5523b 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java @@ -1,6 +1,10 @@ package org.rdfhdt.hdt.util; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; @@ -12,6 +16,7 @@ import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.io.PrintStream; @@ -20,6 +25,7 @@ import java.nio.file.Path; import java.util.Iterator; import java.util.Random; +import java.util.zip.GZIPOutputStream; public class LargeFakeDataSetStreamSupplier { @@ -95,12 +101,34 @@ public void createNTFile(String file) throws IOException { } } - public ThreadedStream createNTInputStream() throws IOException { + public ThreadedStream createNTInputStream(CompressionType compressionType) throws IOException { PipedOutputStream pout = new PipedOutputStream(); InputStream is = new PipedInputStream(pout); + OutputStream out; + + if (compressionType != null) { + switch (compressionType) { + case NONE: + out = pout; + break; + case XZ: + out = new XZCompressorOutputStream(pout); + break; + case BZIP: + out = new BZip2CompressorOutputStream(pout); + break; + case GZIP: + out = new GZIPOutputStream(pout); + break; + default: + throw new NotImplementedException(compressionType.name()); + } + } else { + out = pout; + } ExceptionThread run = new ExceptionThread(() -> { - try (PrintStream ps = new PrintStream(pout, true)) { + try (PrintStream ps = new PrintStream(out, true)) { Iterator it = createTripleStringStream(); while (it.hasNext()) { it.next().dumpNtriple(ps); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java new file mode 100644 index 00000000..cfc2be32 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java @@ -0,0 +1,48 @@ +package org.rdfhdt.hdt.util; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; +import org.rdfhdt.hdt.rdf.RDFParserFactory; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; + +public class LargeFakeDataSetStreamSupplierTest { + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + @Test + public void streamTest() throws IOException { + LargeFakeDataSetStreamSupplier triples = LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(10, 
10); + Path f = tempDir.newFolder().toPath(); + Path testNt = f.resolve("test.nt"); + triples.createNTFile(testNt.toAbsolutePath().toString()); + + try (InputStream is = Files.newInputStream(testNt)) { + PipedCopyIterator it = RDFParserFactory.readAsIterator( + RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES), + is, + HDTTestUtils.BASE_URI, + true, + RDFNotation.NTRIPLES + ); + + it.forEachRemaining(s -> { + try { + Thread.sleep(50); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + System.out.println(s + " " + s.getSubject().getClass()); + }); + } + } +} \ No newline at end of file diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/KWayMergerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/KWayMergerTest.java new file mode 100644 index 00000000..89cbae90 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/KWayMergerTest.java @@ -0,0 +1,169 @@ +package org.rdfhdt.hdt.util.concurrent; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.SizeFetcher; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.*; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +@RunWith(Parameterized.class) +public class KWayMergerTest { + @Parameterized.Parameters(name = "{0}-worker-{1}-ways-({2}-{3})") + public static Collection params() { + final int element = 10_000; + final int split = 100; + final int hugeMemoryFactor = 4; + return List.of( + new Object[]{1, 8, split, element}, + new Object[]{2, 8, split, element}, + new Object[]{8, 8, split, element}, + new Object[]{1, 2, split, element}, + new Object[]{2, 2, split, element}, + new Object[]{8, 2, split, element}, + new Object[]{1, 8, split * hugeMemoryFactor, element}, + new Object[]{2, 8, split * hugeMemoryFactor, element}, + new Object[]{8, 8, split * hugeMemoryFactor, element}, + new Object[]{1, 2, split * hugeMemoryFactor, element}, + new Object[]{2, 2, split * hugeMemoryFactor, element}, + new Object[]{8, 2, split * hugeMemoryFactor, element} + ); + } + + @Parameterized.Parameter + public int workers; + @Parameterized.Parameter(1) + public int k; + @Parameterized.Parameter(2) + public int splitSize; + @Parameterized.Parameter(3) + public int elements; + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + + @Test + public void simpleMerge() throws IOException, KWayMerger.KWayMergerException, InterruptedException { + try (CloseSuppressPath root = CloseSuppressPath.of(tempDir.newFolder().toPath())) { + root.closeWithDeleteRecurse(); + + Random rnd = new Random(64); + List values = IntStream + .iterate(2, s -> 1 + rnd.nextInt(elements * 10)) + .limit(elements).boxed() + .collect(Collectors.toCollection(ArrayList::new)); + + assert values.stream().mapToInt(c -> c).min().orElse(1) > 0; + + List expected = new ArrayList<>(values); + expected.sort(Integer::compareTo); + 
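// the KWayMergerImpl below sorts each fixed-size chunk in createChunk and writes it as VByte-encoded positive integers with a 0 terminator, then mergeChunks k-way merges the chunk files with MergeExceptionIterator until a single sorted run remains +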
//assertNotEquals(values, expected); // uncomment it if you don't trust me... + + KWayMerger> merger = new KWayMerger<>(root, new AsyncIteratorFetcher<>(values.iterator()), new KWayMerger.KWayMergerImpl<>() { + @Override + public void createChunk(Supplier flux, CloseSuppressPath output) throws KWayMerger.KWayMergerException { + Integer v; + List obj = new ArrayList<>(); + while ((v = flux.get()) != null) { + obj.add(v); + } + obj.sort(Integer::compareTo); + + try (OutputStream os = output.openOutputStream(1024)) { + for (Integer i : obj) { + VByte.encode(os, i); + } + VByte.encode(os, 0); + } catch (IOException e) { + throw new KWayMerger.KWayMergerException(e); + } + } + + @Override + public void mergeChunks(List inputs, CloseSuppressPath output) throws KWayMerger.KWayMergerException { + try { + List> lists = new ArrayList<>(); + for (CloseSuppressPath path : inputs) { + List list = new ArrayList<>(); + try (InputStream is = path.openInputStream(1024)) { + while (true) { + long value = VByte.decode(is); + + if (value == 0) { + break; + } + + list.add((int) value); + } + } + lists.add(list); + } + ExceptionIterator merge = MergeExceptionIterator.buildOfTree( + e -> ExceptionIterator.of(e.iterator()), + Integer::compareTo, + lists, + 0, + lists.size() + ); + + try (OutputStream os = output.openOutputStream(1024)) { + while (merge.hasNext()) { + VByte.encode(os, merge.next()); + } + VByte.encode(os, 0); + } catch (IOException e) { + throw new KWayMerger.KWayMergerException(e); + } + IOUtil.closeAll(inputs); + } catch (IOException e) { + throw new KWayMerger.KWayMergerException(e); + } + } + + @Override + public Supplier newStopFlux(Supplier flux) { + return new SizeFetcher<>(flux, e -> 1, splitSize); + } + }, workers, k); + + merger.start(); + Optional paths = merger.waitResult(); + + assertFalse(paths.isEmpty()); + CloseSuppressPath end = paths.get(); + + List actual = new ArrayList<>(); + try (InputStream is = end.openInputStream(1024)) { + while (true) { + long value = VByte.decode(is); + + if (value == 0) { + break; + } + + actual.add((int) value); + } + } + + assertEquals(expected, actual); + } + + } + +} \ No newline at end of file diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/TreeWorkerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/TreeWorkerTest.java new file mode 100644 index 00000000..dd5bcc32 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/TreeWorkerTest.java @@ -0,0 +1,308 @@ +package org.rdfhdt.hdt.util.concurrent; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; +import org.rdfhdt.hdt.util.BitUtil; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +@RunWith(Parameterized.class) +public class TreeWorkerTest { + @Parameterized.Parameters(name = "test {0} worker(s) {1} way(s)") + public static Collection params() { + return Arrays.asList( + new Object[]{1, 1}, + new 
Object[]{8, 1}, + new Object[]{1, 4}, + new Object[]{8, 4} + ); + } + + private static class SyncSupplierTest implements TreeWorker.TreeWorkerSupplier { + private final int max; + private final long sleep; + private int val; + private boolean inUse = false; + + public SyncSupplierTest(int max, long sleep) { + this.max = max; + this.sleep = sleep; + } + + @Override + public Integer get() { + synchronized (this) { + assertFalse(inUse); + inUse = true; + } + sleepOrThrow(sleep); + synchronized (this) { + assertTrue(inUse); + inUse = false; + } + if (val == max) { + return null; + } + return ++val; + } + } + + private static Integer sum(Integer[] array, int count) { + int s = 0; + for (int i = 0; i < count; i++) { + s += array[i]; + } + return s; + } + + private static class CountCatTest implements TreeWorker.TreeWorkerCat { + int call = 0; + + @Override + public Integer construct(Integer[] array, int count) { + synchronized (this) { + call++; + } + return sum(array, count); + } + } + + private static class CountComparator implements Comparator { + int call = 0; + + @Override + public int compare(Integer o1, Integer o2) { + synchronized (this) { + call++; + } + return Integer.compare(o1, o2); + } + } + + private static class IntegerArrayList extends ArrayList { + } + + @Parameterized.Parameter + public int workers; + @Parameterized.Parameter(1) + public int ways; + + @Test + public void syncSupplierTest() throws InterruptedException, TreeWorker.TreeWorkerException { + TreeWorker.TreeWorkerCat cat = TreeWorkerTest::sum; + int max = 10; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 20L); + + TreeWorker worker = new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(max * (max + 1) / 2, result.intValue()); + } + + @Test(expected = TreeWorker.TreeWorkerException.class) + public void noElementSupplierTest() throws TreeWorker.TreeWorkerException { + TreeWorker.TreeWorkerCat cat = TreeWorkerTest::sum; + int max = 0; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 20L); + + // should crash because the supplier won't return any value to merge + new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + } + + @Test + public void oneElementSupplierTest() throws InterruptedException, TreeWorker.TreeWorkerException { + TreeWorker.TreeWorkerCat cat = TreeWorkerTest::sum; + int max = 1; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 20L); + + TreeWorker worker = new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(1, result.intValue()); + } + + @Test + public void catExceptionTest() throws InterruptedException, TreeWorker.TreeWorkerException { + final String error = "I like HDT"; + TreeWorker.TreeWorkerCat cat = (a, b) -> { + throw new RuntimeException(error); + }; + int max = 1; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 20L); + + TreeWorker worker = new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + try { + worker.waitToComplete(); + } catch (TreeWorker.TreeWorkerException e) { + assertEquals(error, 
e.getCause().getMessage()); + } + assertTrue(worker.isCompleted()); + } + + @Test + public void countTest() throws InterruptedException, TreeWorker.TreeWorkerException { + CountCatTest cat = new CountCatTest(); + int max = 1 << 5; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 2L); + + TreeWorker worker = new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(max * (max + 1) / 2, result.intValue()); + } + + @Test + public void countAscendTest() throws InterruptedException, TreeWorker.TreeWorkerException { + CountCatTest cat = new CountCatTest(); + int max = 1 << 5 - 1; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 2L); + + TreeWorker worker = new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(max * (max + 1) / 2, result.intValue()); + } + + @Test + public void deleteTest() throws TreeWorker.TreeWorkerException, InterruptedException { + int max = 10; + Set elements = new HashSet<>(); + TreeWorker.TreeWorkerCat cat = (array, count) -> { + synchronized (elements) { + for (int i = 0; i < count; i++) { + elements.remove(array[i] * max); + } + int next = sum(array, count); + elements.add(next * max); + return next; + } + }; + TreeWorker.TreeWorkerSupplier supplier = new TreeWorker.TreeWorkerSupplier<>() { + int value = 0; + + @Override + public Integer get() { + if (value == max) { + return null; + } + int v = ++value; + synchronized (elements) { + elements.add(v * max); + } + return v; + } + }; + + TreeWorker.TreeWorkerDelete delete = elements::remove; + + TreeWorker worker = new TreeWorker<>(cat, supplier, delete, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(1, elements.size()); + assertEquals(result * max, elements.iterator().next().intValue()); + assertEquals(max * (max + 1) / 2, result.intValue()); + } + + @Test + public void mergeSortTest() throws TreeWorker.TreeWorkerException, InterruptedException { + Random rnd = new Random(42); + int count = 20; + int maxValue = Integer.MAX_VALUE / 4; + List values = new ArrayList<>(); + List lst = new ArrayList<>(); + for (int i = 0; i < count; i++) { + int v = rnd.nextInt(maxValue); + values.add(v); + lst.add(v); + } + assertEquals(lst, values); + List sorted = lst.stream() + .map(i -> i * 3) + .sorted(Comparator.comparingInt(a -> a)) + .collect(Collectors.toList()); + assertNotEquals(sorted, values); + CountComparator com = new CountComparator(); + assertTrue(com.compare(1325939940, -1360544799) > 0); + assertTrue(com.compare(2, 1) > 0); + assertTrue(com.compare(-3, -2) < 0); + assertTrue(com.compare(-2, -3) > 0); + com.call = 0; + TreeWorker worker = new TreeWorker<>( + (IntegerArrayList[] array, int length) -> { + Iterator it = MergeExceptionIterator.buildOfTree( + l -> ExceptionIterator.of(l.iterator()), + com, + array, length).asIterator(); + IntegerArrayList l = new IntegerArrayList(); + while (it.hasNext()) { + l.add(it.next()); + } + IntegerArrayList tst = new IntegerArrayList(); + tst.addAll(l); + tst.sort(Integer::compareTo); + sleepOrThrow(25); + 
assertEquals(tst, l); + return l; + }, + new TreeWorker.TreeWorkerSupplier<>() { + int index; + + @Override + public IntegerArrayList get() { + if (index == values.size()) { + return null; + } + IntegerArrayList l = new IntegerArrayList(); + l.add(values.get(index++)); + sleepOrThrow(25); + return l; + } + }, + null, v -> v.stream() + .map(i -> i * 3) + .collect(Collectors.toCollection(IntegerArrayList::new)), IntegerArrayList[]::new, workers, ways + ); + worker.start(); + List result = worker.waitToComplete(); + // test O(n log(n)) + assertTrue("calls: " + com.call + ", n logn : " + count * BitUtil.log2(count), com.call <= count * BitUtil.log2(count)); + assertEquals(sorted, result); + } + + private static void sleepOrThrow(long time) { + try { + Thread.sleep(time); + } catch (InterruptedException e) { + throw new AssertionError("Interruption", e); + } + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java index 00f3a662..791f3a3a 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java @@ -5,13 +5,22 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.Closeable; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import org.junit.Assert; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; public class IOUtilTest { + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + @Before public void setUp() throws Exception { } @@ -20,28 +29,28 @@ public void setUp() throws Exception { public void testWriteLong() { try { ByteArrayOutputStream bout = new ByteArrayOutputStream(); - + IOUtil.writeLong(bout, 3); IOUtil.writeLong(bout, 4); IOUtil.writeLong(bout, 0xFF000000000000AAL); IOUtil.writeLong(bout, 0x33AABBCCDDEEFF11L); - + ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); - + long a = IOUtil.readLong(bin); assertEquals(a, 3); - + long b = IOUtil.readLong(bin); assertEquals(b, 4); - + long c = IOUtil.readLong(bin); assertEquals(c, 0xFF000000000000AAL); - + long d = IOUtil.readLong(bin); assertEquals(d, 0x33AABBCCDDEEFF11L); - + } catch (IOException e) { - fail("Exception thrown: "+e); + fail("Exception thrown: " + e); } } @@ -49,28 +58,119 @@ public void testWriteLong() { public void testWriteInt() { try { ByteArrayOutputStream bout = new ByteArrayOutputStream(); - + IOUtil.writeInt(bout, 3); IOUtil.writeInt(bout, 4); IOUtil.writeInt(bout, 0xFF0000AA); IOUtil.writeInt(bout, 0xAABBCCDD); - + ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); - + long a = IOUtil.readInt(bin); assertEquals(a, 3); - + long b = IOUtil.readInt(bin); assertEquals(b, 4); - + long c = IOUtil.readInt(bin); assertEquals(c, 0xFF0000AA); - + long d = IOUtil.readInt(bin); assertEquals(d, 0xAABBCCDD); - + } catch (IOException e) { - fail("Exception thrown: "+e); + fail("Exception thrown: " + e); + } + } + + @Test(expected = IOException.class) + public void closeAllSeverity11Test() throws IOException { + IOUtil.closeAll( + () -> { + throw new IOException(); + }, + () -> { + throw new IOException(); + }, + () -> { + throw new IOException(); + } + ); + } + + @Test(expected = IOException.class) + public void closeAllSeverity12Test() throws IOException { + IOUtil.closeAll( + (Closeable) () -> { + throw new IOException(); + } + ); + } + + 
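// the closeAllSeverity tests above and below pin down IOUtil.closeAll's error contract: when the closed resources throw, the most severe throwable is the one propagated (Error over RuntimeException over IOException) +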
@Test(expected = IOException.class) + public void closeAllSeverity13Test() throws IOException { + IOUtil.closeAll( + () -> { + throw new IOException(); + }, + () -> { + throw new IOException(); + } + ); + } + + @Test(expected = RuntimeException.class) + public void closeAllSeverity2Test() throws IOException { + IOUtil.closeAll( + () -> { + throw new IOException(); + }, + () -> { + throw new RuntimeException(); + }, + () -> { + throw new IOException(); + } + ); + } + + @Test(expected = Error.class) + public void closeAllSeverity3Test() throws IOException { + IOUtil.closeAll( + () -> { + throw new Error(); + }, + () -> { + throw new RuntimeException(); + }, + () -> { + throw new IOException(); + } + ); + } + + @Test + public void closeablePathTest() throws IOException { + Path p = tempDir.newFolder().toPath(); + + Path p1 = p.resolve("test1"); + try (CloseSuppressPath csp = CloseSuppressPath.of(p1)) { + Files.writeString(csp.getJavaPath(), "test"); + Assert.assertTrue(Files.exists(p1)); } + Assert.assertFalse(Files.exists(p1)); + + + Path p2 = p.resolve("test2"); + try (CloseSuppressPath csp = CloseSuppressPath.of(p2)) { + csp.closeWithDeleteRecurse(); + Path p3 = csp.getJavaPath().resolve("test3/test4/test5"); + Path f4 = p3.resolve("child.txt"); + Files.createDirectories(p3); + Files.writeString(f4, "hello world"); + Assert.assertTrue(Files.exists(f4)); + } + Assert.assertFalse(Files.exists(p2)); + } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java new file mode 100644 index 00000000..b6a85e98 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java @@ -0,0 +1,257 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.junit.Assert; +import org.junit.Test; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; + +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.util.Arrays; +import java.util.List; + +public class CompressNodeTest { + + @Test + public void writeReadTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List nodes = Arrays.asList( + new IndexedNode("bob", 1), + new IndexedNode("michel", 3), + new IndexedNode("jack", 2), + new IndexedNode("charles", 6) + ); + new ExceptionThread(() -> { + CompressNodeReader reader = new CompressNodeReader(in); + Assert.assertEquals(nodes.size(), reader.getSize()); + try { + for (IndexedNode excepted : nodes) { + Assert.assertTrue(reader.hasNext()); + IndexedNode actual = reader.next(); + Assert.assertEquals(excepted.getIndex(), actual.getIndex()); + CompressTest.assertCharSequenceEquals("indexed node", excepted.getNode(), actual.getNode()); + } + reader.checkComplete(); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + CompressNodeWriter writer = new CompressNodeWriter(out, nodes.size()); + try { + for (IndexedNode node : nodes) { + writer.appendNode(node); + } + writer.writeCRC(); + // raw data to check if we didn't read too/not enough data + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + 
public void writeReadUtilTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List nodes = Arrays.asList( + new IndexedNode("bob", 1), + new IndexedNode("michel", 3), + new IndexedNode("jack", 2), + new IndexedNode("charles", 6) + ); + new ExceptionThread(() -> { + CompressNodeReader reader = new CompressNodeReader(in); + Assert.assertEquals(nodes.size(), reader.getSize()); + try { + for (IndexedNode excepted : nodes) { + Assert.assertTrue(reader.hasNext()); + IndexedNode actual = reader.next(); + Assert.assertEquals(excepted.getIndex(), actual.getIndex()); + CompressTest.assertCharSequenceEquals("indexed node", excepted.getNode(), actual.getNode()); + } + reader.checkComplete(); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + try { + CompressUtil.writeCompressedSection(nodes, out, null); + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + public void writeReadPassTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List nodes = Arrays.asList( + new IndexedNode("bob", 1), + new IndexedNode("michel", 3), + new IndexedNode("jack", 2), + new IndexedNode("charles", 6) + ); + new ExceptionThread(() -> { + CompressNodeReader reader = new CompressNodeReader(in); + Assert.assertEquals(nodes.size(), reader.getSize()); + try { + for (IndexedNode excepted : nodes) { + Assert.assertTrue(reader.hasNext()); + IndexedNode actual = reader.read(); + Assert.assertEquals(excepted.getIndex(), actual.getIndex()); + CompressTest.assertCharSequenceEquals("indexed node", excepted.getNode(), actual.getNode()); + String actual1Node = actual.getNode().toString(); + IndexedNode actual2 = reader.read(); + Assert.assertEquals(actual.getIndex(), actual2.getIndex()); + CompressTest.assertCharSequenceEquals("post pass indexed node", actual1Node, actual2.getNode()); + Assert.assertTrue(reader.hasNext()); + reader.pass(); + } + reader.checkComplete(); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + CompressNodeWriter writer = new CompressNodeWriter(out, nodes.size()); + try { + for (IndexedNode node : nodes) { + writer.appendNode(node); + } + writer.writeCRC(); + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + public void writeReadMergeTest() throws InterruptedException, IOException { + PipedOutputStream node1Out = new PipedOutputStream(); + PipedInputStream node1In = new PipedInputStream(); + node1Out.connect(node1In); + + PipedOutputStream node2Out = new PipedOutputStream(); + PipedInputStream node2In = new PipedInputStream(); + node2Out.connect(node2In); + + PipedOutputStream finalOut = new PipedOutputStream(); + PipedInputStream finalIn = new PipedInputStream(); + finalOut.connect(finalIn); + + List nodes1 = Arrays.asList( + new IndexedNode("zzzaaa", 1), + new IndexedNode("zzzccc", 2), + new IndexedNode("zzzddd", 6) + ); + List nodes2 = Arrays.asList( + new 
IndexedNode("zzzbbb", 3), + new IndexedNode("zzzeee", 4), + new IndexedNode("zzzfff", 5), + new IndexedNode("zzzggg", 7) + ); + List finalExcepted = Arrays.asList( + new IndexedNode("zzzaaa", 1), + new IndexedNode("zzzbbb", 3), + new IndexedNode("zzzccc", 2), + new IndexedNode("zzzddd", 6), + new IndexedNode("zzzeee", 4), + new IndexedNode("zzzfff", 5), + new IndexedNode("zzzggg", 7) + ); + new ExceptionThread(() -> { + CompressNodeReader reader = new CompressNodeReader(finalIn); + Assert.assertEquals(finalExcepted.size(), reader.getSize()); + try { + for (IndexedNode excepted : finalExcepted) { + Assert.assertTrue(reader.hasNext()); + IndexedNode actual = reader.next(); + Assert.assertEquals(excepted.getIndex(), actual.getIndex()); + CompressTest.assertCharSequenceEquals("merged node", excepted.getNode(), actual.getNode()); + } + reader.checkComplete(); + Assert.assertEquals(98, finalIn.read()); + Assert.assertEquals(18, finalIn.read()); + Assert.assertEquals(22, finalIn.read()); + } finally { + finalIn.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + try { + CompressUtil.writeCompressedSection(nodes1, node1Out, null); + node1Out.write(34); + node1Out.write(12); + node1Out.write(27); + } finally { + node1Out.close(); + } + }, "Write1Test"), + new ExceptionThread(() -> { + try { + CompressUtil.writeCompressedSection(nodes2, node2Out, null); + node2Out.write(42); + node2Out.write(19); + node2Out.write(1); + } finally { + node2Out.close(); + } + }, "Write2Test"), + new ExceptionThread(() -> { + try { + CompressUtil.mergeCompressedSection(node1In, node2In, finalOut, null); + finalOut.write(98); + finalOut.write(18); + finalOut.write(22); + + Assert.assertEquals(34, node1In.read()); + Assert.assertEquals(12, node1In.read()); + Assert.assertEquals(27, node1In.read()); + + Assert.assertEquals(42, node2In.read()); + Assert.assertEquals(19, node2In.read()); + Assert.assertEquals(1, node2In.read()); + } finally { + try { + node1In.close(); + } finally { + try { + node2In.close(); + } finally { + finalOut.close(); + } + } + } + }, "MergeTest") + ).startAll().joinAndCrashIfRequired(); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java new file mode 100644 index 00000000..1ba68115 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java @@ -0,0 +1,75 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.junit.Assert; +import org.junit.Test; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +public class CompressTest { + public static void assertCharSequenceEquals(String location, CharSequence s1, CharSequence s2) { + if (CharSequenceComparator.getInstance().compare(s1, s2) != 0) { + throw new AssertionError(location + + "\nexcepted: " + s1 + + "\nactual: " + s2 + ); + } + } + + @Test + public void noDupeTest() { + List duplicatedList = Arrays.asList( + new IndexedNode("a", 0), + new IndexedNode("b", 1), + new IndexedNode("b", 2), + new IndexedNode("c", 3), + new IndexedNode("c", 4), + new IndexedNode("c", 5), + new IndexedNode("d", 6), + new IndexedNode("e", 7), + new IndexedNode("f", 8) + ); + List noDuplicatedList = Arrays.asList( + "a", + "b", + "c", + "d", + "e", + "f" 
+ ); + + Set duplicates = new HashSet<>(); + duplicates.add(2L); + duplicates.add(4L); + duplicates.add(5L); + + Iterator actual = CompressUtil.asNoDupeCharSequenceIterator( + ExceptionIterator.of(duplicatedList.iterator()), + (originalIndex, duplicatedIndex, oldIndex) -> + Assert.assertTrue(duplicates.remove(duplicatedIndex)) + ); + for (CharSequence e : noDuplicatedList) { + Assert.assertTrue(actual.hasNext()); + CharSequence a = actual.next().getNode(); + + assertCharSequenceEquals("noDupeTest", e, a); + } + } + + @Test + public void bitMappingTest() { + long sharedCount = 1000L; + long index1 = 888L; + + long sharedIndex1 = CompressUtil.asShared(index1); + + Assert.assertEquals(index1, CompressUtil.computeSharedNode(sharedIndex1, sharedCount)); + Assert.assertEquals(sharedCount + index1, CompressUtil.computeSharedNode(CompressUtil.getHeaderId(index1), sharedCount)); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTripleTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTripleTest.java new file mode 100644 index 00000000..360eba68 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTripleTest.java @@ -0,0 +1,206 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.junit.Assert; +import org.junit.Test; +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.triples.IndexedTriple; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; + +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +public class CompressTripleTest { + @Test + public void writeReadTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List triples = Arrays.asList( + new IndexedTriple( + new IndexedNode("", 1), + new IndexedNode("", 9), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 1), + new IndexedNode("", 9), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 3), + new IndexedNode("", 10), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 2), + new IndexedNode("", 12), + new IndexedNode("", 15) + ), + new IndexedTriple( + new IndexedNode("", 2), + new IndexedNode("", 12), + new IndexedNode("", 15) + ), + new IndexedTriple( + new IndexedNode("", 6), + new IndexedNode("", 14), + new IndexedNode("", 13) + ) + ); + List noDupeTriples = Arrays.asList( + new IndexedTriple( + new IndexedNode("", 1), + new IndexedNode("", 9), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 3), + new IndexedNode("", 10), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 2), + new IndexedNode("", 12), + new IndexedNode("", 15) + ), + new IndexedTriple( + new IndexedNode("", 6), + new IndexedNode("", 14), + new IndexedNode("", 13) + ) + ); + new ExceptionThread(() -> { + CompressTripleReader reader = new CompressTripleReader(in); + try { + for (IndexedTriple exceptedIndex : noDupeTriples) { + Assert.assertTrue(reader.hasNext()); + TripleID actual = reader.next(); + TripleID excepted = new TripleID( + exceptedIndex.getSubject().getIndex(), + exceptedIndex.getPredicate().getIndex(), + 
exceptedIndex.getObject().getIndex() + ); + Assert.assertEquals(excepted, actual); + } + Assert.assertFalse(reader.hasNext()); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + CompressTripleWriter writer = new CompressTripleWriter(out); + try { + for (IndexedTriple triple : triples) { + writer.appendTriple(triple); + } + writer.writeCRC(); + // raw data to check if we didn't read too/not enough data + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + public void writeReadTripleIDTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List triples = Arrays.asList( + new TripleID(1, 9, 11), + new TripleID(1, 9, 11), + new TripleID(3, 10, 11), + new TripleID(2, 12, 15), + new TripleID(2, 12, 15), + new TripleID(6, 14, 13) + ); + List noDupeTriples = Arrays.asList( + new TripleID(1, 9, 11), + new TripleID(3, 10, 11), + new TripleID(2, 12, 15), + new TripleID(6, 14, 13) + ); + new ExceptionThread(() -> { + CompressTripleReader reader = new CompressTripleReader(in); + try { + for (TripleID excepted : noDupeTriples) { + Assert.assertTrue(reader.hasNext()); + TripleID actual = reader.next(); + Assert.assertEquals(excepted, actual); + } + Assert.assertFalse(reader.hasNext()); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + CompressTripleWriter writer = new CompressTripleWriter(out); + try { + for (TripleID triple : triples) { + writer.appendTriple(triple); + } + writer.writeCRC(); + // raw data to check if we didn't read too/not enough data + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + public void writeReadMergeTest() { + List triples1 = Arrays.asList( + new TripleID(2, 2, 2), + new TripleID(4, 4, 4), + new TripleID(5, 5, 5) + ); + List triples2 = Arrays.asList( + new TripleID(1, 1, 1), + new TripleID(3, 3, 3), + new TripleID(6, 6, 6) + ); + List triplesFinal = Arrays.asList( + new TripleID(1, 1, 1), + new TripleID(2, 2, 2), + new TripleID(3, 3, 3), + new TripleID(4, 4, 4), + new TripleID(5, 5, 5), + new TripleID(6, 6, 6) + ); + Iterator actual = new CompressTripleMergeIterator( + ExceptionIterator.of(triples1.iterator()), + ExceptionIterator.of(triples2.iterator()), + TripleComponentOrder.SPO + ).asIterator(); + Iterator expected = triplesFinal.iterator(); + + expected.forEachRemaining(tid -> { + Assert.assertTrue(actual.hasNext()); + Assert.assertEquals(tid, actual.next()); + }); + Assert.assertFalse(actual.hasNext()); + + } + +} diff --git a/hdt-java-core/src/test/resources/hdtGenDisk/unicode_disk_encode.nt b/hdt-java-core/src/test/resources/hdtGenDisk/unicode_disk_encode.nt new file mode 100644 index 00000000..b7498b13 --- /dev/null +++ b/hdt-java-core/src/test/resources/hdtGenDisk/unicode_disk_encode.nt @@ -0,0 +1,2 @@ + "d\u00A0normal"@nl . + "d\u00C2\u00A0normal"@dv . 
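Taken together, this first patch wires HDTCatTree into HDTManager: the RDF input is parsed until the RDFFluxStop declares a chunk full, each chunk is turned into an HDT by the HDTSupplier, and the chunk HDTs are merged with HDTCat. A minimal usage sketch of the new API, mirroring the call pattern of DynamicTest above; the input file, work directory, base URI and output name are placeholders:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;

import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.hdt.HDTSupplier;
import org.rdfhdt.hdt.options.HDTSpecification;
import org.rdfhdt.hdt.rdf.RDFFluxStop;

public class CatTreeExample {
    public static void main(String[] args) throws Exception {
        HDTSpecification spec = new HDTSpecification();
        // directory used to store the intermediate chunk HDTs during the cat tree
        spec.set("loader.cattree.location", "/tmp/cattree-work");

        try (InputStream in = Files.newInputStream(Path.of("dataset.nt"));
             HDT hdt = HDTManager.catTree(
                     RDFFluxStop.sizeLimit(100_000_000), // close a chunk after ~100 MB of NT data
                     HDTSupplier.memory(),               // build each chunk as an in-memory HDT
                     in,
                     "http://example.org/#",             // base URI
                     RDFNotation.NTRIPLES,
                     spec,
                     null)) {                            // no progress listener
            hdt.saveToHDT("dataset.hdt", null);
        }
    }
}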
From f4a88eb5b9cf4ab2713e9186296b037eefb69b1c Mon Sep 17 00:00:00 2001 From: qaate47 Date: Fri, 21 Oct 2022 18:31:17 +0200 Subject: [PATCH 3/9] Better profiling, use "loader.type" option with disk, delta node merger, cat, allow to setup RDFFluxStop/HDTSupplier in options, allow HTTPs in generateHDT and fix baseURI for Windows --- .../java/org/rdfhdt/hdt/hdt/HDTSupplier.java | 32 ++ .../rdfhdt/hdt/options/HDTOptionsKeys.java | 55 +- .../java/org/rdfhdt/hdt/rdf/RDFFluxStop.java | 490 +++++++++++++----- .../java/org/rdfhdt/hdt/tools/RDF2HDT.java | 11 +- .../compact/bitmap/AppendableWriteBitmap.java | 2 +- .../impl/section/WriteDictionarySection.java | 2 +- .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java | 139 ++++- .../impl/diskimport/SectionCompressor.java | 3 +- .../IndexNodeDeltaMergeExceptionIterator.java | 363 +++++++++++++ .../hdt/iterator/utils/PipedCopyIterator.java | 390 +++++++------- .../rdfhdt/hdt/options/HideHDTOptions.java | 57 ++ .../org/rdfhdt/hdt/triples/IndexedNode.java | 1 + .../java/org/rdfhdt/hdt/util/Profiler.java | 167 +++++- .../util/io/CloseSuppressFileProvider.java | 171 ++++++ .../hdt/util/io/CloseSuppressFileSystem.java | 81 +++ .../rdfhdt/hdt/util/io/CloseSuppressPath.java | 118 +++-- .../java/org/rdfhdt/hdt/util/io/IOUtil.java | 2 +- .../util/io/compress/CompressNodeReader.java | 21 +- .../hdt/util/io/compress/CompressUtil.java | 32 +- .../org/rdfhdt/hdt/hdt/HDTManagerTest.java | 65 ++- ...exNodeDeltaMergeExceptionIteratorTest.java | 348 +++++++++++++ .../org/rdfhdt/hdt/rdf/RDFFluxStopTest.java | 65 +++ .../LargeFakeDataSetStreamSupplierTest.java | 22 +- .../org/rdfhdt/hdt/util/ProfilerTest.java | 139 +++++ .../hdt/util/io/CloseSuppressPathTest.java | 54 ++ .../org/rdfhdt/hdt/util/io/IOUtilTest.java | 4 +- .../util/string/AssertionCharSequence.java | 62 +++ .../string/AssertionCharSequenceTest.java | 43 ++ 28 files changed, 2499 insertions(+), 440 deletions(-) create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HideHDTOptions.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressFileProvider.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressFileSystem.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/RDFFluxStopTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/ProfilerTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/CloseSuppressPathTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequence.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequenceTest.java diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java index 85f1821d..02b5bca3 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java @@ -8,7 +8,9 @@ import java.io.IOException; import java.nio.file.Path; +import java.util.HashMap; import java.util.Iterator; +import java.util.Map; /** * Interface describing an HDT generator method @@ -17,6 +19,12 @@ */ @FunctionalInterface public interface HDTSupplier { + Map SUPPLIERS = new HashMap<>() { + { + 
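// maps the values of the "supplier.type" option to their implementations so that fromSpec(HDTOptions) can resolve "memory" and "disk" by name +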
put(HDTOptionsKeys.LOADER_CATTREE_HDT_SUPPLIER_VALUE_MEMORY, memory()); + put(HDTOptionsKeys.LOADER_CATTREE_HDT_SUPPLIER_VALUE_DISK, disk()); + } + }; /** * @return implementation using in-memory hdt */ @@ -38,6 +46,30 @@ static org.rdfhdt.hdt.hdt.HDTSupplier disk() { }; } + /** + * create a HDTSupplier from spec + * @param spec the specs + * @return hdt supplier + */ + static HDTSupplier fromSpec(HDTOptions spec) { + if (spec == null) { + return memory(); + } + String supplier = spec.get(HDTOptionsKeys.HDT_SUPPLIER_KEY); + + if (supplier == null || supplier.isEmpty()) { + return memory(); + } + + HDTSupplier s = SUPPLIERS.get(supplier); + + if (s == null) { + throw new IllegalArgumentException("Can't find a supplier for name: " + supplier); + } + + return s; + } + /** * Generate the HDT * diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java index dd680697..af942c07 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java @@ -1,5 +1,7 @@ package org.rdfhdt.hdt.options; +import org.rdfhdt.hdt.rdf.RDFFluxStop; + /** * keys usable with {@link org.rdfhdt.hdt.options.HDTOptions#set(String, String)} * @author Antoine Willerval @@ -61,9 +63,17 @@ public class HDTOptionsKeys { * Key for the loading mode of a RDF file for the * {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDT(String, String, org.rdfhdt.hdt.enums.RDFNotation, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)} * method, this key isn't working with the other methods. - * Value can be {@link #LOADER_TYPE_VALUE_ONE_PASS} or {@link #LOADER_TYPE_VALUE_TWO_PASS}. + * Value can be {@link #LOADER_TYPE_VALUE_ONE_PASS}, {@link #LOADER_TYPE_VALUE_TWO_PASS}, {@link #LOADER_TYPE_VALUE_CAT} or {@link #LOADER_TYPE_VALUE_DISK}. */ public static final String LOADER_TYPE_KEY = "loader.type"; + /** + * Value for {@link #LOADER_TYPE_KEY}, read using disk generation, reduce the RAM usage and increase disk usage + */ + public static final String LOADER_TYPE_VALUE_DISK = "disk"; + /** + * Value for {@link #LOADER_TYPE_KEY}, read using HDTCat generation, merge using HDTCat HDT, reduce the RAM usage + */ + public static final String LOADER_TYPE_VALUE_CAT = "cat"; /** * Value for {@link #LOADER_TYPE_KEY}, read twice the RDF file, reduce the RAM usage */ @@ -78,16 +88,59 @@ public class HDTOptionsKeys { * set this option to delete the directory in case of an interruption of the process. file value. */ public static final String LOADER_CATTREE_LOCATION_KEY = "loader.cattree.location"; + /** + * Same as {@link #LOADER_TYPE_KEY} for loader in the CATTREE method + */ + public static final String LOADER_CATTREE_LOADERTYPE_KEY = "loader.cattree.loadertype"; /** * Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} catTree methods, * this option will create a hdt file after the HDT generation, the returned HDT will be a mapped HDT of the HDT * file. slower, increase the disk usage, but drastically reduce the RAM usage. file value. */ public static final String LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY = "loader.cattree.futureHDTLocation"; + /** + * Key for the fault factor for the {@link org.rdfhdt.hdt.hdt.HDTManager} catTree default value of the + * split size of the RDFFluxStop in the generateHDT method. 
+ */ + public static final String LOADER_CATTREE_MEMORY_FAULT_FACTOR = "loader.cattree.memoryFaultFactor"; + /** + * Key for the hdt supplier type, default to memory + */ + public static final String HDT_SUPPLIER_KEY = "supplier.type"; + /** + * Value for {@link #HDT_SUPPLIER_KEY}, use HDTGenDisk to create the HDT + */ + public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_DISK = "disk"; + /** + * Value for {@link #HDT_SUPPLIER_KEY}, use the default memory implementation to create the HDT + */ + public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_MEMORY = "memory"; + /** + * Key for the rdf flux stop type, default to the maximum memory allocated + */ + public static final String RDF_FLUX_STOP_KEY = "rdffluxstop.type"; + /** + * Value type for the {@link #RDF_FLUX_STOP_KEY}, using {@link RDFFluxStop#asConfig()} would be easier + */ + public static final String RDF_FLUX_STOP_VALUE_SIZE = "size"; + /** + * Value type for the {@link #RDF_FLUX_STOP_KEY}, using {@link RDFFluxStop#asConfig()} would be easier + */ + public static final String RDF_FLUX_STOP_VALUE_COUNT = "count"; + /** + * Value type for the {@link #RDF_FLUX_STOP_KEY}, using {@link RDFFluxStop#asConfig()} would be easier + */ + public static final String RDF_FLUX_STOP_VALUE_NO_LIMIT = "no_limit"; + + /** * Key for enabling the profiler (if implemented), default to false. Boolean value */ public static final String PROFILER_KEY = "profiler"; + /** + * Key for the profiler output (if implemented). File value + */ + public static final String PROFILER_OUTPUT_KEY = "profiler.output"; /** * Key for enabling the canonical NTriple file simple parser, default to false. Boolean value */ diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java index d3ce6f8b..db61b8e8 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java @@ -1,146 +1,370 @@ package org.rdfhdt.hdt.rdf; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.triples.TripleString; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import java.util.function.BiFunction; import java.util.function.BinaryOperator; +import java.util.function.LongFunction; /** * Rdf flux stopper descriptor + * * @author Antoine Willerval */ -public interface RDFFluxStop { - /** - * @return basic implementation without any limit - */ - static RDFFluxStop noLimit() { - return new RDFFluxStop() { - @Override - public boolean canHandle(TripleString ts) { - return true; - } - - @Override - public void restart() { - // nothing - } - }; - } - - /** - * implementation of flux stop stopping after a maximum triple count - * - * @param maxTriple maximum count - * @return FluxStop - */ - static RDFFluxStop countLimit(long maxTriple) { - if (maxTriple <= 0) { - throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!"); - } - return new RDFFluxStop() { - long current = 0; - - @Override - public boolean canHandle(TripleString ts) { - return current++ < maxTriple; - } - - @Override - public void restart() { - current = 0; - } - }; - } - - /** - * implementation of flux stop stopping after a maximum NTriple size - * - * @param maxSize maximum size - * @return FluxStop - */ - static RDFFluxStop sizeLimit(long maxSize) { - if (maxSize <= 0) { - throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!"); - } - return new 
RDFFluxStop() { - long size = 0; - - @Override - public boolean canHandle(TripleString ts) { - long tsSize; - try { - tsSize = ts.asNtriple().toString().getBytes(StandardCharsets.UTF_8).length; - } catch (IOException e) { - throw new RuntimeException("Can't estimate the size of the triple " + ts, e); - } - try { - return size < maxSize; - } finally { - size += tsSize; - } - } - - @Override - public void restart() { - size = 0; - } - }; - } - - /** - * should we stop the flux after this triple or not? - * - * @param ts the triple - * @return true if the flux can handle this triple, false otherwise - */ - boolean canHandle(TripleString ts); - - /** - * restart the flux stop - */ - void restart(); - - /** - * combine 2 rdf flux stop with a boolean operation - * @param fluxStop the other flux stop - * @param operator the operator - * @return rdffluxstop - * @see #and(RDFFluxStop) - * @see #or(RDFFluxStop) - */ - default RDFFluxStop booleanOp(RDFFluxStop fluxStop, BinaryOperator operator) { - return new RDFFluxStop() { - @Override - public boolean canHandle(TripleString ts) { - boolean left = RDFFluxStop.this.canHandle(ts); - boolean right = fluxStop.canHandle(ts); - return operator.apply(left, right); - } - - @Override - public void restart() { - RDFFluxStop.this.restart(); - fluxStop.restart(); - } - }; - } - - /** - * {@link #booleanOp(RDFFluxStop, BinaryOperator)} version for AND - * @param fluxStop other flux stop - * @return rdffluxstop - */ - default RDFFluxStop and(RDFFluxStop fluxStop) { - return booleanOp(fluxStop, (a, b) -> a && b); - } - - /** - * {@link #booleanOp(RDFFluxStop, BinaryOperator)} version for OR - * @param fluxStop other flux stop - * @return rdffluxstop - */ - default RDFFluxStop or(RDFFluxStop fluxStop) { - return booleanOp(fluxStop, (a, b) -> a || b); - } +public abstract class RDFFluxStop { + private static final Map> BUILDER = new HashMap<>(); + private static final Map> BUILDER_OP = new HashMap<>(); + + static { + registerCustomRDFFluxStopConfig(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT, RDFFluxStop::countLimit); + registerCustomRDFFluxStopConfig(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE, RDFFluxStop::sizeLimit); + registerCustomRDFFluxStopConfig(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_NO_LIMIT, l -> noLimit()); + + registerCustomRDFFluxStopOperator('&', RDFFluxStop::and); + registerCustomRDFFluxStopOperator('|', RDFFluxStop::or); + } + + /** + * register a custom flux stop option for the {@link #readConfig(String)} method + * + * @param name name of the option + * @param builder builder + */ + public static void registerCustomRDFFluxStopConfig(String name, LongFunction builder) { + name.chars().forEach(c -> { + if (!Character.isJavaIdentifierPart(c)) { + throw new IllegalArgumentException("Config can't contain non identifier part! Found '" + c + "'"); + } + }); + BUILDER.put(name, builder); + } + + /** + * register a custom flux stop operator for the {@link #readConfig(String)} method + * + * @param operator operator character + * @param builder builder + */ + public static void registerCustomRDFFluxStopOperator(char operator, BiFunction builder) { + if (Character.isJavaIdentifierPart(operator) || operator == '(' || operator == ')') { + throw new IllegalArgumentException("Operator can't be an identifier part or a parenthesis! 
Found '" + operator + "'"); + } + BUILDER_OP.put(operator, builder); + } + + private static int searchNextParenthesis(String cfg, int start) { + int deep = 0; + for (int i = start; i < cfg.length(); i++) { + switch (cfg.charAt(i)) { + case '(': + deep++; + break; + case ')': + if (deep == 0) { + return i; + } + deep--; + } + } + + throw new IllegalArgumentException("Can't find next parenthesis for start " + start); + } + + /** + * read a config to a flux stop, grammar: + * + *

<pre>FluxStop: limiter:number | ( FluxStop ) | Operator | (empty)</pre> + * + * <pre>Operator: ( FluxStop ) op ( FluxStop )</pre> + * + * <p>You can register limiter with the {@link #registerCustomRDFFluxStopConfig(String, LongFunction)} method</p> + * + * <p>You can register op with the {@link #registerCustomRDFFluxStopOperator(char, BiFunction)} method</p>

+ * + * @param cfg config string + * @param start start in the config string + * @param end end in the config string + * @return RDFFluxStop or null if no RDFFluxStop is present + * @see #readConfig(String) + */ + public static RDFFluxStop readConfig(String cfg, int start, int end) { + if (cfg == null) { + return null; + } + int i = start; + // current element for boolean operators + RDFFluxStop element = null; + while (i < end) { + char c = cfg.charAt(i++); + + if (c == '(') { // start of block + if (element != null) { + throw new IllegalArgumentException("Find an element after another one without having an operator! " + (i - 1)); + } + int next = searchNextParenthesis(cfg, i); + element = readConfig(cfg, i, next); + i = next + 1; + + } else if (c == ')') { // end of block, should be handled here + throw new IllegalArgumentException("Find closing parenthesis without opening! " + (i - 1)); + } else if (Character.isJavaIdentifierPart(c)) { // start of function + + // read key + int startElement = i - 1; + int j = i; + while (j < end) { + if (!Character.isJavaIdentifierPart(cfg.charAt(j))) { + break; + } + j++; + } + + if (j == end || cfg.charAt(j) != ':') { // no value for key + throw new IllegalArgumentException("Identifier without value: " + startElement); + } + + String key = cfg.substring(startElement, j); + + LongFunction builder = BUILDER.get(key); + + if (builder == null) { // key isn't a right config + throw new IllegalArgumentException("Can't find option: " + key); + } + + // read value + + startElement = j + 1; + if (startElement == end || !Character.isDigit(cfg.charAt(startElement))) { // not a number value + throw new IllegalArgumentException("Identifier without number value: " + key + ", " + startElement); + } + + j = startElement; + while (j < end) { + if (!Character.isDigit(cfg.charAt(j))) { + break; + } + j++; + } + long value = Long.parseLong(cfg.substring(startElement, j)); + + element = builder.apply(value); + i = j; + } else { + // read operator or throw error + BiFunction opFunc = BUILDER_OP.get(c); + + if (opFunc == null) { + throw new IllegalArgumentException("Unknow component: " + c + ", " + (i - 1)); + } + + if (element == null) { + throw new IllegalArgumentException("Find operator without element before! " + (i - 1)); + } + return opFunc.apply(element, readConfig(cfg, i, end)); + } + } + + return element; + } + + /** + * read a config to a flux stop, see {@link #readConfig(String, int, int)} for grammar + * + * @param cfg config string + * @return RDFFluxStop or null if no RDFFluxStop is present + * @see #readConfig(String, int, int) + */ + public static RDFFluxStop readConfig(String cfg) { + return cfg == null ? 
null : readConfig(cfg, 0, cfg.length()); + } + + /** + * @return basic implementation without any limit + */ + public static RDFFluxStop noLimit() { + return new RDFFluxStop() { + @Override + public boolean canHandle(TripleString ts) { + return true; + } + + @Override + public void restart() { + // nothing + } + + @Override + public String asConfig() { + return HDTOptionsKeys.RDF_FLUX_STOP_VALUE_NO_LIMIT + ":0"; + } + }; + } + + /** + * implementation of flux stop stopping after a maximum triple count + * + * @param maxTriple maximum count + * @return FluxStop + */ + public static RDFFluxStop countLimit(long maxTriple) { + if (maxTriple <= 0) { + throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!"); + } + return new RDFFluxStop() { + long current = 0; + + @Override + public boolean canHandle(TripleString ts) { + return current++ < maxTriple; + } + + @Override + public void restart() { + current = 0; + } + + @Override + public String asConfig() { + return HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":" + maxTriple; + } + }; + } + + /** + * implementation of flux stop stopping after a maximum NTriple size + * + * @param maxSize maximum size + * @return FluxStop + */ + public static RDFFluxStop sizeLimit(long maxSize) { + if (maxSize <= 0) { + throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!"); + } + return new RDFFluxStop() { + long size = 0; + + @Override + public boolean canHandle(TripleString ts) { + long tsSize; + try { + tsSize = ts.asNtriple().toString().getBytes(StandardCharsets.UTF_8).length; + } catch (IOException e) { + throw new RuntimeException("Can't estimate the size of the triple " + ts, e); + } + try { + return size < maxSize; + } finally { + size += tsSize; + } + } + + @Override + public void restart() { + size = 0; + } + + @Override + public String asConfig() { + return HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":" + maxSize; + } + }; + } + + /** + * should we stop the flux after this triple or not? 
+ *
+ * @param ts the triple
+ * @return true if the flux can handle this triple, false otherwise
+ */
+ public abstract boolean canHandle(TripleString ts);
+
+ /**
+ * restart the flux stop
+ */
+ public abstract void restart();
+
+ /**
+ * @return config value for the {@link org.rdfhdt.hdt.options.HDTOptionsKeys#RDF_FLUX_STOP_KEY} option
+ */
+ public abstract String asConfig();
+
+ @Override
+ public String toString() {
+ return asConfig();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == this) {
+ return true;
+ }
+ if (!(obj instanceof RDFFluxStop)) {
+ return false;
+ }
+ RDFFluxStop fluxStop = (RDFFluxStop) obj;
+
+ return asConfig().equals(fluxStop.asConfig());
+ }
+
+ @Override
+ public int hashCode() {
+ // keep the equals/hashCode contract: equality is defined by asConfig()
+ return asConfig().hashCode();
+ }
+
+ /**
+ * combine two RDF flux stops with a boolean operation; returns this if fluxStop == null
+ *
+ * @param fluxStop the other flux stop
+ * @param stringOperator operator for the {@link #asConfig()} version
+ * @param operator the operator
+ * @return the combined RDFFluxStop
+ * @see #and(RDFFluxStop)
+ * @see #or(RDFFluxStop)
+ */
+ public RDFFluxStop booleanOp(RDFFluxStop fluxStop, String stringOperator, BinaryOperator<Boolean> operator) {
+ if (fluxStop == null) {
+ return this;
+ }
+ return new RDFFluxStop() {
+ @Override
+ public boolean canHandle(TripleString ts) {
+ boolean left = RDFFluxStop.this.canHandle(ts);
+ boolean right = fluxStop.canHandle(ts);
+ return operator.apply(left, right);
+ }
+
+ @Override
+ public void restart() {
+ RDFFluxStop.this.restart();
+ fluxStop.restart();
+ }
+
+ @Override
+ public String asConfig() {
+ String left = RDFFluxStop.this.asConfig();
+ String right = fluxStop.asConfig();
+ return "(" + left + ")" + stringOperator + "(" + right + ")";
+ }
+ };
+ }
+
+ /**
+ * {@link #booleanOp(RDFFluxStop, String, BinaryOperator)} version for AND
+ *
+ * @param fluxStop other flux stop
+ * @return the combined RDFFluxStop
+ */
+ public RDFFluxStop and(RDFFluxStop fluxStop) {
+ return booleanOp(fluxStop, "&", (a, b) -> a && b);
+ }
+
+ /**
+ * {@link #booleanOp(RDFFluxStop, String, BinaryOperator)} version for OR
+ *
+ * @param fluxStop other flux stop
+ * @return the combined RDFFluxStop
+ */
+ public RDFFluxStop or(RDFFluxStop fluxStop) {
+ return booleanOp(fluxStop, "|", (a, b) -> a || b);
+ }
 }
diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
index 89ed7d63..379474b9 100644
--- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
+++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
@@ -27,6 +27,7 @@
 package org.rdfhdt.hdt.tools;
 
 import java.io.IOException;
+import java.net.URI;
 import java.nio.file.Path;
 import java.util.List;
 
@@ -131,7 +132,15 @@ public void execute() throws ParserException, IOException {
 spec.setOptions(options);
 }
 if (baseURI == null) {
- baseURI = "file://" + rdfInput;
+ String input = rdfInput.toLowerCase();
+ if (input.startsWith("http") || input.startsWith("ftp")) {
+ baseURI = URI.create(rdfInput).toString();
+ } else {
+ baseURI = Path.of(rdfInput).toUri().toString();
+ }
+ if (!quiet) {
+ System.out.println("base URI not specified, using '" + baseURI + "'");
+ }
 }
 
 RDFNotation notation = null;
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java
index 799aaca4..a8194b19 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java
+++ 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java @@ -150,7 +150,7 @@ public void save(OutputStream output, ProgressListener listener) throws IOExcept out.writeCRC(); // write the storage file, already contains the CRC - Files.copy(file.getJavaPath(), output); + Files.copy(file, output); // delete the file file.close(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java index a4b197ef..f8dcdd72 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java @@ -115,7 +115,7 @@ public void save(OutputStream output, ProgressListener listener) throws IOExcept // Write blocks directly to output, they have their own CRC check. blocks.save(output, listener); // Write blocks data directly to output, the load was writing using a CRC check. - Files.copy(tempFilename.getJavaPath(), output); + Files.copy(tempFilename, output); } @Override diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index 30598b6b..24b037dd 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -28,6 +28,7 @@ import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.options.HideHDTOptions; import org.rdfhdt.hdt.rdf.RDFFluxStop; import org.rdfhdt.hdt.rdf.RDFParserCallback; import org.rdfhdt.hdt.rdf.RDFParserFactory; @@ -55,10 +56,7 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Optional; +import java.util.*; public class HDTManagerImpl extends HDTManager { private static final Logger logger = LoggerFactory.getLogger(HDTManagerImpl.class); @@ -126,15 +124,55 @@ public HDT doIndexedHDT(HDT hdt, ProgressListener listener) throws IOException { return hdt; } + private RDFFluxStop readFluxStopOrSizeLimit(HDTOptions spec) { + // if no config, use default implementation + return Objects.requireNonNullElseGet( + RDFFluxStop.readConfig(spec.get(HDTOptionsKeys.RDF_FLUX_STOP_KEY)), + () -> { + // get the chunk size to base the work + String loaderType = spec.get(HDTOptionsKeys.LOADER_CATTREE_LOADERTYPE_KEY); + + if (!HDTOptionsKeys.LOADER_TYPE_VALUE_DISK.equals(loaderType)) { + // memory based implementation, we can only store the NT file + return RDFFluxStop.sizeLimit(getMaxChunkSize()); + } + + // disk based implementation, we only have to reduce the fault-factor of the map files + long chunkSize = findBestMemoryChunkDiskMapTreeCat(); + + String factorOpt = spec.get(HDTOptionsKeys.LOADER_CATTREE_MEMORY_FAULT_FACTOR); + double factor; + + if (factorOpt == null || factorOpt.isEmpty()) { + // default value + factor = 1.4; + } else { + factor = Double.parseDouble(factorOpt); + + if (factor <= 0) { + throw new IllegalArgumentException(HDTOptionsKeys.LOADER_CATTREE_MEMORY_FAULT_FACTOR + " can't have a negative or 0 value!"); + } + } + + // create a count limit from the chunk size / factor, set a minimum value for low factor + return RDFFluxStop.countLimit(Math.max(128, (long) 
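+ // worked example (hypothetical numbers): chunkSize = 1_000_000 triples
+ // with the default factor of 1.4 yields countLimit(1_400_000); the
+ // Math.max(128, ...) floor only matters for very small factors.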
(chunkSize * factor)));
+ }
+ );
+ }
+
 @Override
 public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions spec, ProgressListener listener) throws IOException, ParserException {
 //choose the importer
 String loaderType = spec.get(HDTOptionsKeys.LOADER_TYPE_KEY);
 TempHDTImporter loader;
- if (HDTOptionsKeys.LOADER_TYPE_VALUE_TWO_PASS.equals(loaderType)) {
+ if (HDTOptionsKeys.LOADER_TYPE_VALUE_DISK.equals(loaderType)) {
+ return doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, CompressionType.guess(rdfFileName), spec, listener);
+ } else if (HDTOptionsKeys.LOADER_TYPE_VALUE_CAT.equals(loaderType)) {
+ return doHDTCatTree(readFluxStopOrSizeLimit(spec), HDTSupplier.fromSpec(spec), rdfFileName, baseURI, rdfNotation, spec, listener);
+ } else if (HDTOptionsKeys.LOADER_TYPE_VALUE_TWO_PASS.equals(loaderType)) {
 loader = new TempHDTImporterTwoPass(useSimple(spec));
 } else {
- if (!HDTOptionsKeys.LOADER_TYPE_VALUE_ONE_PASS.equals(loaderType)) {
+ if (loaderType != null && !HDTOptionsKeys.LOADER_TYPE_VALUE_ONE_PASS.equals(loaderType)) {
 logger.warn("Used the option {} with value {}, which isn't recognized, using default value {}",
 HDTOptionsKeys.LOADER_TYPE_KEY, loaderType, HDTOptionsKeys.LOADER_TYPE_VALUE_ONE_PASS);
 }
@@ -168,15 +206,40 @@ public HDT doGenerateHDT(InputStream fileStream, String baseURI, RDFNotation rdf
 // create a parser for this rdf stream
 RDFParserCallback parser = RDFParserFactory.getParserCallback(rdfNotation);
 // read the stream as triples
- Iterator<TripleString> iterator = RDFParserFactory.readAsIterator(parser, fileStream, baseURI, true, rdfNotation);
-
- return doGenerateHDT(iterator, baseURI, hdtFormat, listener);
+ try (PipedCopyIterator<TripleString> iterator = RDFParserFactory.readAsIterator(parser, fileStream, baseURI, true, rdfNotation)) {
+ return doGenerateHDT(iterator, baseURI, hdtFormat, listener);
+ }
 }
 
 @Override
 public HDT doGenerateHDT(Iterator<TripleString> triples, String baseURI, HDTOptions spec, ProgressListener listener) throws IOException {
 //choose the importer
- TempHDTImporterOnePass loader = new TempHDTImporterOnePass(false);
+ String loaderType = spec.get(HDTOptionsKeys.LOADER_TYPE_KEY);
+ TempHDTImporterOnePass loader;
+ if (HDTOptionsKeys.LOADER_TYPE_VALUE_DISK.equals(loaderType)) {
+ try {
+ return doGenerateHDTDisk(triples, baseURI, spec, listener);
+ } catch (ParserException e) {
+ throw new RuntimeException(e);
+ }
+ } else if (HDTOptionsKeys.LOADER_TYPE_VALUE_CAT.equals(loaderType)) {
+ try {
+ return doHDTCatTree(readFluxStopOrSizeLimit(spec), HDTSupplier.fromSpec(spec), triples, baseURI, spec, listener);
+ } catch (ParserException e) {
+ throw new RuntimeException(e);
+ }
+ } else {
+ if (loaderType != null) {
+ if (HDTOptionsKeys.LOADER_TYPE_VALUE_TWO_PASS.equals(loaderType)) {
+ logger.warn("Used the option {} with value {}, which isn't available for stream generation, using default value {}",
+ HDTOptionsKeys.LOADER_TYPE_KEY, loaderType, HDTOptionsKeys.LOADER_TYPE_VALUE_ONE_PASS);
+ } else if (!HDTOptionsKeys.LOADER_TYPE_VALUE_ONE_PASS.equals(loaderType)) {
+ logger.warn("Used the option {} with value {}, which isn't recognized, using default value {}",
+ HDTOptionsKeys.LOADER_TYPE_KEY, loaderType, HDTOptionsKeys.LOADER_TYPE_VALUE_ONE_PASS);
+ }
+ }
+ loader = new TempHDTImporterOnePass(useSimple(spec));
+ }
 
 // Create TempHDT
 try (TempHDT modHdt = loader.loadFromTriples(spec, triples, baseURI, listener)) {
@@ -212,9 +275,9 @@ public HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation
 // create a parser for this rdf 
stream RDFParserCallback parser = RDFParserFactory.getParserCallback(rdfNotation, useSimple(hdtFormat)); // read the stream as triples - Iterator iterator = RDFParserFactory.readAsIterator(parser, fileStream, baseURI, true, rdfNotation); - - return doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener); + try (PipedCopyIterator iterator = RDFParserFactory.readAsIterator(parser, fileStream, baseURI, true, rdfNotation)) { + return doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener); + } } /** @@ -224,6 +287,28 @@ static long getMaxChunkSize(int workers) { Runtime runtime = Runtime.getRuntime(); return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * 0.85 / (1.5 * 3 * workers)); } + /** + * @return a theoretical maximum amount of memory the JVM will attempt to use + */ + static long getMaxChunkSize() { + Runtime runtime = Runtime.getRuntime(); + return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * 0.85); + } + + private static long findBestMemoryChunkDiskMapTreeCat() { + Runtime runtime = Runtime.getRuntime(); + long maxRam = (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * 0.85) / 3; + + int shift = 0; + + while (shift != 63 && (1L << shift) * BitUtil.log2(1L << shift) < maxRam) { + shift++; + } + + // it will take at most "shift" bits per triple + // we divide by 3 for the 3 maps + return maxRam / shift; + } @Override public HDT doGenerateHDTDisk(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener progressListener) throws IOException, ParserException { @@ -247,9 +332,7 @@ public HDT doGenerateHDTDisk(Iterator iterator, String baseURI, HD // location of the future HDT file, do not set to create the HDT in memory while mergin String futureHDTLocation = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY); - Profiler profiler = new Profiler("doGenerateHDTDisk"); - String profilerString = hdtFormat.get(HDTOptionsKeys.PROFILER_KEY); - profiler.setDisabled(profilerString == null || !profilerString.equalsIgnoreCase("true")); + Profiler profiler = new Profiler("doGenerateHDTDisk", hdtFormat); // check and set default values if required if (workers == 0) { workers = Runtime.getRuntime().availableProcessors(); @@ -413,10 +496,13 @@ public HDT doGenerateHDTDisk(Iterator iterator, String baseURI, HD return hdt; } } finally { - profiler.stop(); - profiler.writeProfiling(); - listener.notifyProgress(100, "Clearing disk"); - basePath.close(); + try { + profiler.stop(); + profiler.writeProfiling(); + listener.notifyProgress(100, "Clearing disk"); + } finally { + basePath.close(); + } } } @@ -475,8 +561,9 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, String fi @Override protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream stream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { RDFParserCallback parser = RDFParserFactory.getParserCallback(rdfNotation, useSimple(hdtFormat)); - Iterator iterator = RDFParserFactory.readAsIterator(parser, stream, baseURI, true, rdfNotation); - return doHDTCatTree(fluxStop, supplier, iterator, baseURI, hdtFormat, listener); + try (PipedCopyIterator iterator = RDFParserFactory.readAsIterator(parser, stream, baseURI, true, rdfNotation)) { + return doHDTCatTree(fluxStop, supplier, iterator, baseURI, hdtFormat, listener); + } } @Override @@ -490,11 +577,12 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, 
HDTSupplier supplier, Iterator< basePath = Path.of(baseNameOpt); } + // hide the loader type to avoid infinite recursion + hdtFormat = new HideHDTOptions(hdtFormat, key -> HDTOptionsKeys.LOADER_TYPE_KEY.equals(key) ? HDTOptionsKeys.LOADER_CATTREE_LOADERTYPE_KEY : key); + Path futureHDTLocation = Optional.ofNullable(hdtFormat.get(HDTOptionsKeys.LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY)).map(Path::of).orElse(null); - Profiler profiler = new Profiler("doHDTCatTree"); - String profilerString = hdtFormat.get(HDTOptionsKeys.PROFILER_KEY); - profiler.setDisabled(profilerString == null || !profilerString.equalsIgnoreCase("true")); + Profiler profiler = new Profiler("doHDTCatTree", hdtFormat); FluxStopTripleStringIterator it = new FluxStopTripleStringIterator(iterator, fluxStop); @@ -555,6 +643,7 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< // if a future HDT location has been asked, move to it and map the HDT if (futureHDTLocation != null) { + Files.createDirectories(futureHDTLocation.getParent()); Files.deleteIfExists(futureHDTLocation); Files.move(hdtFile, futureHDTLocation); return HDTManager.mapHDT(futureHDTLocation.toAbsolutePath().toString()); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java index 51b511d9..b9dcaba4 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java @@ -1,6 +1,7 @@ package org.rdfhdt.hdt.hdt.impl.diskimport; import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; +import org.rdfhdt.hdt.iterator.utils.IndexNodeDeltaMergeExceptionIterator; import org.rdfhdt.hdt.iterator.utils.SizeFetcher; import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.triples.IndexedNode; @@ -441,7 +442,7 @@ private void computeSection(List triples, String section, int start, // section try (OutputStream output = openW.get()) { - CompressUtil.writeCompressedSection(CompressNodeMergeIterator.buildOfTree(readers), size, output, il); + CompressUtil.writeCompressedSection(IndexNodeDeltaMergeExceptionIterator.buildOfTree(readers), size, output, il); } } finally { if (async) { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIterator.java new file mode 100644 index 00000000..da929298 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIterator.java @@ -0,0 +1,363 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.triples.IndexedNode; + +import java.util.Arrays; +import java.util.List; +import java.util.function.Function; + +/** + * Merge iterator to merge {@link org.rdfhdt.hdt.triples.IndexedNode} with delta to reduce the compare count + * + * @param fetcher exception type + * @author Antoine Willerval + */ +public abstract class IndexNodeDeltaMergeExceptionIterator implements ExceptionIterator { + /** + * Create a tree of merge iterators from an array of element + * + * @param itFunction a function to create an iterator from an element + * @param array the elements + * @param length the number of elements + * @param input of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( 
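+ // shape sketch (hypothetical fetchers f1..f4): the tree built here is
+ //   Merge(Merge(Leaf(f1), Leaf(f2)), Merge(Leaf(f3), Leaf(f4)))
+ // so each merged element costs at most about log2(n) comparisons instead of n.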
+ Function> itFunction, I[] array, int length) { + return buildOfTree(itFunction, array, 0, length); + } + + /** + * Create a tree of merge iterators from an array of element + * + * @param itFunction a function to create an iterator from an element + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( + Function> itFunction, I[] array, int start, int end) { + return buildOfTree(itFunction, Arrays.asList(array), start, end); + } + + /** + * Create a tree of merge iterators from an array of element + * + * @param itFunction a function to create an iterator from an element + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( + Function> itFunction, List array, int start, int end) { + return buildOfTree0(itFunction, array, start, end); + } + + /** + * Create a tree of merge iterators from an array of element + * + * @param array the elements + * @param length the number of elements + * @param input of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static , E extends Exception> ExceptionIterator buildOfTree( + I[] array, int length) { + return buildOfTree(i -> i, array, 0, length); + } + + /** + * Create a tree of merge iterators from an array of element + * + * @param array the elements + * @param input of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static , E extends Exception> ExceptionIterator buildOfTree( + I[] array) { + return buildOfTree(i -> i, array, 0, array.length); + } + + /** + * Create a tree of merge iterators from an array of element + * + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param exception returned by the iterator + * @return the iterator + */ + public static , E extends Exception> ExceptionIterator buildOfTree( + I[] array, int start, int end) { + return buildOfTree(i -> i, Arrays.asList(array), start, end); + } + + /** + * Create a tree of merge iterators from an array of element + * + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param exception returned by the iterator + * @return the iterator + */ + public static , E extends Exception> ExceptionIterator buildOfTree( + List array, int start, int end) { + return buildOfTree(i -> i, array, start, end); + } + + private static IndexNodeDeltaMergeExceptionIterator buildOfTree0( + Function> itFunction, List array, int start, int end) { + int length = end - start; + if (length <= 0) { + return new ExceptionIteratorEmpty<>(); + } + if (length == 1) { + return new ExceptionIteratorMap<>(itFunction.apply(array.get(start))); + } + int mid = (start + end) / 2; + return new ExceptionIteratorMerger<>( + buildOfTree0(itFunction, array, start, mid), + buildOfTree0(itFunction, array, mid, end) + ); + } + + /** + * Compare 2 nodes from a starting delta + * + * @param delta the start delta + * @param node1 the 1st node + * @param node2 the 2nd node + * @return delta+1 if node1 <= node2 or -delta-1 otherwise + */ + static int compareToDelta(int delta, IndexedNode 
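+ // worked example: node1 = "http://ex/b", node2 = "http://ex/c", delta = 0:
+ // the strings first differ at index 10 ('b' vs 'c'), so the result is +11
+ // (node1 <= node2); with the nodes swapped it would be -11. A positive
+ // delta starts the scan after the prefix already known to be equal.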
node1, IndexedNode node2) { + CharSequence cs1 = node1.getNode(); + CharSequence cs2 = node2.getNode(); + + int len = Math.min(cs1.length(), cs2.length()); + + for (int i = delta; i < len; i++) { + char a = cs1.charAt(i); + char b = cs2.charAt(i); + if (a != b) { + return a > b ? -(i + 1) : (i + 1); + } + } + + return cs1.length() != len ? -(len + 1) : (len + 1); + } + + protected IndexedNode next; + protected int delta; + protected int pivot; + + private IndexNodeDeltaMergeExceptionIterator() { + } + + @Override + public boolean hasNext() throws E { + if (next != null) { + return true; + } + + fetchNext(); + + return peekNext() != null; + } + + @Override + public IndexedNode next() throws E { + if (!hasNext()) { + return null; + } + try { + return next; + } finally { + next = null; + } + } + + /** + * @return get without switching to the next element of the iterator, should be called after a {@link #fetchNext()} + */ + public IndexedNode peekNext() { + return next; + } + + /** + * @return the delta of the last next element, can be after {@link #next()} or after {@link #fetchNext()} + */ + public int getDelta() { + return delta; + } + + /** + * fetch the next element, will update the {@link #peekNext()} return value and the {@link #getDelta()} return value + * + * @throws E fetch exception + */ + public abstract void fetchNext() throws E; + + public void printMergeTree() { + printMergeTree(0); + } + + protected abstract void printMergeTree(int depth); + + /** + * Implementation of a fetcher to get a node with its delta + * + * @param fetch exception + */ + public interface IndexNodeDeltaFetcher { + /** + * @return the next node + * @throws E fetch exception + */ + IndexedNode fetchNode() throws E; + + /** + * @return the delta of the last next node with the previous last-last next node + * @throws E fetch exception + */ + int lastDelta() throws E; + } + + static class ExceptionIteratorMap extends IndexNodeDeltaMergeExceptionIterator { + private final IndexNodeDeltaFetcher iterator; + + public ExceptionIteratorMap(IndexNodeDeltaFetcher iterator) { + this.iterator = iterator; + } + + @Override + public void fetchNext() throws E { + next = iterator.fetchNode(); + delta = iterator.lastDelta(); + } + + @Override + protected void printMergeTree(int depth) { + System.out.println(" ".repeat(depth) + "Leaf[" + iterator + "]"); + } + } + + static class ExceptionIteratorEmpty extends IndexNodeDeltaMergeExceptionIterator { + @Override + public void fetchNext() { + } + + @Override + protected void printMergeTree(int depth) { + System.out.println(" ".repeat(depth) + "Empty"); + } + } + + static class ExceptionIteratorMerger extends IndexNodeDeltaMergeExceptionIterator { + private final IndexNodeDeltaMergeExceptionIterator it1; + private final IndexNodeDeltaMergeExceptionIterator it2; + private IndexedNode last1; + private IndexedNode last2; + private boolean send1; + + public ExceptionIteratorMerger(IndexNodeDeltaMergeExceptionIterator it1, IndexNodeDeltaMergeExceptionIterator it2) { + this.it1 = it1; + this.it2 = it2; + } + + @Override + public void fetchNext() throws E { + if (last1 == null) { + it1.fetchNext(); + last1 = it1.peekNext(); + } + if (last2 == null) { + it2.fetchNext(); + last2 = it2.peekNext(); + } + + // stop fetcher + if (last1 == null || last2 == null) { + if (last1 != null) { + // send last1 if no last1 + next = last1; + last1 = null; + } else if (last2 != null) { + // send last2 if no last1 + next = last2; + last2 = null; + } else { + next = null; + } + // else: stop iteration + 
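+ // about the pivot shortcut below: pivot is the index where the last
+ // comparison diverged; when both pending nodes share more than pivot
+ // characters with their predecessors (pivot < minDelta), the previous
+ // winner's side must win again, so no character comparison is needed.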
return; + } + + // 2 nodes to compare + + int delta1 = it1.getDelta(); + int delta2 = it2.getDelta(); + + // minimum start compare + int minDelta = Math.min(delta1, delta2); + + if (pivot < minDelta) { + // no need to check the values, delta is higher than the diff, resend the same value + if (send1) { + next = last1; + last1 = null; + } else { + next = last2; + last2 = null; + } + + return; + } + + // we need to compare from at least minDelta chars + int deltaCompare = compareToDelta(minDelta, last1, last2); + + if (deltaCompare < 0) { + // node1 > node2 -> send node2 + next = last2; + if (!send1) { + // the last send was the send1, we can send the real delta + delta = delta2; + } else { + // not the same, we need to compare to get the new delta + delta = Math.min(delta2, delta); + } + pivot = -deltaCompare - 1; + last2 = null; + send1 = false; + } else { + // node1 < node2 -> send node1 + next = last1; + if (send1) { + // the last send was the send2, we can send the real delta + delta = delta1; + } else { + // not the same, we need to compare to get the new delta + delta = Math.min(delta1, delta); + } + pivot = deltaCompare - 1; + last1 = null; + send1 = true; + } + } + + @Override + protected void printMergeTree(int depth) { + System.out.println(" ".repeat(depth) + "Merge"); + it1.printMergeTree(depth + 1); + it2.printMergeTree(depth + 1); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java index 904ffd44..85b6ca8a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java @@ -1,7 +1,10 @@ package org.rdfhdt.hdt.iterator.utils; -import java.util.concurrent.ArrayBlockingQueue; +import java.io.Closeable; +import java.io.IOException; import java.util.Iterator; +import java.util.Objects; +import java.util.concurrent.ArrayBlockingQueue; import java.util.function.Function; /** @@ -11,182 +14,213 @@ * @author Antoine Willerval */ -public class PipedCopyIterator implements Iterator { - - /** - * RuntimeException generated by the PipedCopyIterator - * - * @author Antoine Willerval - */ - public static class PipedIteratorException extends RuntimeException { - public PipedIteratorException(String message, Throwable t) { - super(message, t); - } - } - - - /** - * Callback for the {@link #createOfCallback(PipedCopyIterator.PipeCallBack)} method - * - * @param the iterator type - * @author Antoine Willerval - */ - @FunctionalInterface - public interface PipeCallBack { - /** - * method called from the new thread to generate the new data, at the end of the callback, the pipe is closed - * with or without exception - * - * @param pipe the pipe to fill - * @throws Exception any exception returned by the generator - */ - void createPipe(PipedCopyIterator pipe) throws Exception; - } - - /** - * create a piped iterator from a callback runner, the call to the callback should be made in the callbackRunner - * - * @param callbackRunner the callback runner - * @param type of the iterator - * @return the iterator - */ - public static PipedCopyIterator createOfCallback(PipeCallBack callbackRunner) { - PipedCopyIterator pipe = new PipedCopyIterator<>(); - - Thread thread = new Thread(() -> { - try { - callbackRunner.createPipe(pipe); - pipe.closePipe(); - } catch (Throwable e) { - pipe.closePipe(e); - } - }, "PipeIterator"); - thread.start(); - - return pipe; - 
} - - private interface QueueObject { - boolean end(); - - T get(); - } - - private class ElementQueueObject implements QueueObject { - private final T obj; - - private ElementQueueObject(T obj) { - this.obj = obj; - } - - - @Override - public boolean end() { - return false; - } - - @Override - public T get() { - return obj; - } - } - private class EndQueueObject implements QueueObject { - @Override - public boolean end() { - return true; - } - - @Override - public T get() { - throw new IllegalArgumentException(); - } - } - - private final ArrayBlockingQueue> queue = new ArrayBlockingQueue<>(16); - - private T next; - private boolean end; - private PipedIteratorException exception; - - @Override - public boolean hasNext() { - if (end) { - return false; - } - if (next != null) { - return true; - } - - QueueObject obj; - try { - obj = queue.take(); - } catch (InterruptedException e) { - throw new PipedIteratorException("Can't read pipe", e); - } - - if (obj.end()) { - end = true; - if (exception != null) { - throw exception; - } - return false; - } - next = obj.get(); - return true; - } - - @Override - public T next() { - if (!hasNext()) { - return null; - } - T next = this.next; - this.next = null; - return next; - } - - public void closePipe() { - closePipe(null); - } - public void closePipe(Throwable e) { - if (e != null) { - if (e instanceof PipedIteratorException) { - this.exception = (PipedIteratorException) e; - } else { - this.exception = new PipedIteratorException("closing exception", e); - } - } - try { - queue.put(new EndQueueObject()); - } catch (InterruptedException ee) { - throw new PipedIteratorException("Can't close pipe", ee); - } - } - - /** - * map this iterator to another type - * @param mappingFunction the mapping function - * @param the future type - * @return mapped iterator - */ - public Iterator map(Function mappingFunction) { - return new MapIterator<>(this, mappingFunction); - } - /** - * map this iterator to another type - * @param mappingFunction the mapping function - * @param the future type - * @return mapped iterator - */ - public Iterator mapWithId(MapIterator.MapWithIdFunction mappingFunction) { - return new MapIterator<>(this, mappingFunction); - } - - public void addElement(T node) { - try { - queue.put(new ElementQueueObject(node)); - } catch (InterruptedException ee) { - throw new PipedIteratorException("Can't add element to pipe", ee); +public class PipedCopyIterator implements Iterator, Closeable { + + + /** + * RuntimeException generated by the PipedCopyIterator + * + * @author Antoine Willerval + */ + public static class PipedIteratorException extends RuntimeException { + public PipedIteratorException(String message, Throwable t) { + super(message, t); + } + } + + + /** + * Callback for the {@link #createOfCallback(PipedCopyIterator.PipeCallBack)} method + * + * @param the iterator type + * @author Antoine Willerval + */ + @FunctionalInterface + public interface PipeCallBack { + /** + * method called from the new thread to generate the new data, at the end of the callback, the pipe is closed + * with or without exception + * + * @param pipe the pipe to fill + * @throws Exception any exception returned by the generator + */ + void createPipe(PipedCopyIterator pipe) throws Exception; + } + + /** + * create a piped iterator from a callback runner, the call to the callback should be made in the callbackRunner + * + * @param callbackRunner the callback runner + * @param type of the iterator + * @return the iterator + */ + public static PipedCopyIterator 
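+ // usage sketch (hypothetical producer): the callback runs in its own
+ // thread and fills the pipe; close() interrupts that thread, e.g.:
+ //   try (PipedCopyIterator<String> it = PipedCopyIterator.createOfCallback(pipe -> {
+ //       pipe.addElement("a");
+ //       pipe.addElement("b");
+ //   })) {
+ //       it.forEachRemaining(System.out::println);
+ //   }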
createOfCallback(PipeCallBack callbackRunner) { + PipedCopyIterator pipe = new PipedCopyIterator<>(); + + Thread thread = new Thread(() -> { + try { + callbackRunner.createPipe(pipe); + pipe.closePipe(); + } catch (Throwable e) { + pipe.closePipe(e); + } + }, "PipeIterator"); + thread.start(); + + // close the thread at end + pipe.attachThread(thread); + + return pipe; + } + + private interface QueueObject { + boolean end(); + + T get(); + } + + private class ElementQueueObject implements QueueObject { + private final T obj; + + private ElementQueueObject(T obj) { + this.obj = obj; + } + + + @Override + public boolean end() { + return false; + } + + @Override + public T get() { + return obj; + } + } + + private class EndQueueObject implements QueueObject { + @Override + public boolean end() { + return true; + } + + @Override + public T get() { + throw new IllegalArgumentException(); + } + } + + private final ArrayBlockingQueue> queue = new ArrayBlockingQueue<>(16); + + private T next; + private boolean end; + private PipedIteratorException exception; + + private Thread thread; + + @Override + public boolean hasNext() { + if (end) { + return false; + } + if (next != null) { + return true; + } + + QueueObject obj; + try { + obj = queue.take(); + } catch (InterruptedException e) { + throw new PipedIteratorException("Can't read pipe", e); + } + + if (obj.end()) { + end = true; + if (exception != null) { + throw exception; + } + return false; + } + next = obj.get(); + return true; + } + + @Override + public T next() { + if (!hasNext()) { + return null; + } + T next = this.next; + this.next = null; + return next; + } + + public void closePipe() { + closePipe(null); + } + + public void closePipe(Throwable e) { + if (e != null) { + if (e instanceof PipedIteratorException) { + this.exception = (PipedIteratorException) e; + } else { + this.exception = new PipedIteratorException("closing exception", e); + } + } + try { + queue.put(new EndQueueObject()); + } catch (InterruptedException ee) { + throw new PipedIteratorException("Can't close pipe", ee); + } + } + + /** + * map this iterator to another type + * + * @param mappingFunction the mapping function + * @param the future type + * @return mapped iterator + */ + public Iterator map(Function mappingFunction) { + return new MapIterator<>(this, mappingFunction); + } + + /** + * map this iterator to another type + * + * @param mappingFunction the mapping function + * @param the future type + * @return mapped iterator + */ + public Iterator mapWithId(MapIterator.MapWithIdFunction mappingFunction) { + return new MapIterator<>(this, mappingFunction); + } + + public void addElement(T node) { + try { + queue.put(new ElementQueueObject(node)); + } catch (InterruptedException ee) { + throw new PipedIteratorException("Can't add element to pipe", ee); + } + } + + /** + * attach a thread to interrupt with this iterator + * + * @param thread the thread + */ + public void attachThread(Thread thread) { + Objects.requireNonNull(thread, "thread can't be null!"); + if (this.thread != null && this.thread != thread) { + throw new IllegalArgumentException("Thread already attached"); + } + this.thread = thread; + } + + @Override + public void close() throws IOException { + if (thread != null) { + thread.interrupt(); } - } + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HideHDTOptions.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HideHDTOptions.java new file mode 100644 index 00000000..dc557c15 --- /dev/null +++ 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HideHDTOptions.java @@ -0,0 +1,57 @@ +package org.rdfhdt.hdt.options; + +import java.util.Objects; +import java.util.function.Function; + +/** + * {@link HDTOptions} wrapper to redirect a key to another key + * + * @author Antoine Willerval + */ +public class HideHDTOptions implements HDTOptions { + private final HDTOptions spec; + private final Function mapper; + + /** + * @param spec wrapped options + * @param mapper mapping function (key) -> newKey? + */ + public HideHDTOptions(HDTOptions spec, Function mapper) { + this.spec = spec; + this.mapper = mapper; + } + + private String map(String key) { + return Objects.requireNonNullElse(mapper.apply(key), ""); + } + + @Override + public String get(String key) { + return spec.get(map(key)); + } + + @Override + public void set(String key, String value) { + spec.set(map(key), value); + } + + @Override + public void setOptions(String options) { + spec.setOptions(options); + } + + @Override + public long getInt(String key) { + return spec.getInt(map(key)); + } + + @Override + public void setInt(String key, long value) { + spec.setInt(map(key), value); + } + + @Override + public void clear() { + spec.clear(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java index 59da86f4..394ffcfe 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java @@ -4,6 +4,7 @@ import java.util.Comparator; + public class IndexedNode implements Comparable { private static final Comparator NODE_COMPARATOR = CharSequenceComparator.getInstance(); private CharSequence node; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java index 686b21c7..3507004b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java @@ -1,28 +1,90 @@ package org.rdfhdt.hdt.util; +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRCInputStream; +import org.rdfhdt.hdt.util.crc.CRCOutputStream; +import org.rdfhdt.hdt.util.io.IOUtil; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.Objects; /** * tool to profile time + * * @author Antoine Willerval */ public class Profiler { + /** + * Read the profiling values from an input path + * + * @param inputPath input path + * @throws java.io.IOException reading exception + * @throws java.lang.IllegalArgumentException if the file's CRC doesn't match + */ + public static Profiler readFromDisk(Path inputPath) throws IOException { + Profiler p = new Profiler(""); + try (CRCInputStream is = new CRCInputStream(new BufferedInputStream(Files.newInputStream(inputPath)), new CRC32())) { + for (byte b : HEADER) { + if (is.read() != b) { + throw new IOException("Missing header for the profiling file!"); + } + } + p.mainSection = p.new Section(is); + if (!is.readCRCAndCheck()) { + throw new IllegalArgumentException("CRC 
doesn't match when reading the CRC!"); + } + } + return p; + } + + private static final byte[] HEADER = {'H', 'D', 'T', 'P', 'R', 'O', 'F', 'I', 'L', 'E'}; private int maxSize = 0; private final String name; private Section mainSection; private boolean disabled; + private Path outputPath; /** * create a profiler + * * @param name the profiler name */ public Profiler(String name) { - this.name = name; + this(name, null); + } + + /** + * create a profiler from specifications + * + * @param name profiler name + * @param spec spec (nullable) + */ + public Profiler(String name, HDTOptions spec) { + this.name = Objects.requireNonNull(name, "name can't be null!"); + if (spec != null) { + disabled = !"true".equalsIgnoreCase(spec.get(HDTOptionsKeys.PROFILER_KEY)); + String profilerOutputLocation = spec.get(HDTOptionsKeys.PROFILER_OUTPUT_KEY); + if (profilerOutputLocation != null && !profilerOutputLocation.isEmpty()) { + outputPath = Path.of(profilerOutputLocation); + } + } } /** * disable the profiler methods + * * @param disable if true, the methods will be callable, but won't do anything */ public void setDisabled(boolean disable) { @@ -31,6 +93,7 @@ public void setDisabled(boolean disable) { /** * start a section + * * @param name the section name */ public void pushSection(String name) { @@ -63,14 +126,39 @@ public void stop() { getMainSection().stop(); } + /** + * reset the profiler + */ + public void reset() { + mainSection = null; + } + /** * write the profile into the console */ - public void writeProfiling() { + public void writeProfiling() throws IOException { if (disabled) { return; } getMainSection().writeProfiling("", true); + if (outputPath != null) { + writeToDisk(outputPath); + } + } + + /** + * Write the profiling values into the output path + * + * @param outputPath output path + */ + public void writeToDisk(Path outputPath) throws IOException { + try (CRCOutputStream os = new CRCOutputStream(new BufferedOutputStream(Files.newOutputStream(outputPath)), new CRC32())) { + for (byte b : HEADER) { + os.write(b); + } + getMainSection().writeSection(os); + os.writeCRC(); + } } /** @@ -88,13 +176,54 @@ public Section getMainSection() { */ public class Section { private final String name; - private final long start = System.nanoTime(); - private long end = start; - private final List
subSections = new ArrayList<>(); - private Section currentSection; + private final long start; + private long end; + private final List
subSections; + private transient Section currentSection; Section(String name) { this.name = name; + start = System.nanoTime(); + end = start; + subSections = new ArrayList<>(); + } + + /** + * read the section from the input stream + * + * @param is input stream + * @throws IOException io exception + */ + Section(InputStream is) throws IOException { + start = VByte.decode(is); + end = VByte.decode(is); + + int nameLength = (int) VByte.decode(is); + byte[] nameBytes = IOUtil.readBuffer(is, nameLength, null); + name = new String(nameBytes, StandardCharsets.UTF_8); + + int subSize = (int) VByte.decode(is); + subSections = new ArrayList<>(subSize); + for (int i = 0; i < subSize; i++) { + subSections.add(new Section(is)); + } + } + + void writeSection(OutputStream os) throws IOException { + byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); + + VByte.encode(os, start); + VByte.encode(os, end); + + VByte.encode(os, nameBytes.length); + os.write(nameBytes); + + List
sub = getSubSections(); + VByte.encode(os, sub.size()); + + for (Section s : sub) { + s.writeSection(os); + } } /** @@ -137,6 +266,32 @@ boolean popSection() { } } + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + Section section = (Section) o; + + return start == section.start + && end == section.end + && name.equals(section.name) + && subSections.equals(section.subSections); + } + + @Override + public int hashCode() { + int result = name.hashCode(); + result = 31 * result + (int) (start ^ (start >>> 32)); + result = 31 * result + (int) (end ^ (end >>> 32)); + result = 31 * result + subSections.hashCode(); + return result; + } + void stop() { if (isRunning()) { currentSection.stop(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressFileProvider.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressFileProvider.java new file mode 100644 index 00000000..26a77132 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressFileProvider.java @@ -0,0 +1,171 @@ +package org.rdfhdt.hdt.util.io; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URI; +import java.nio.channels.AsynchronousFileChannel; +import java.nio.channels.FileChannel; +import java.nio.channels.SeekableByteChannel; +import java.nio.file.*; +import java.nio.file.attribute.BasicFileAttributes; +import java.nio.file.attribute.FileAttribute; +import java.nio.file.attribute.FileAttributeView; +import java.nio.file.spi.FileSystemProvider; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ExecutorService; + +import static org.rdfhdt.hdt.util.io.CloseSuppressPath.of; + +/** + * {@link FileSystemProvider} implementation for {@link CloseSuppressPath} + * + * @author Antoine Willerval + */ +public class CloseSuppressFileProvider extends FileSystemProvider { + private final FileSystemProvider provider; + + + public CloseSuppressFileProvider(FileSystemProvider provider) { + this.provider = provider; + } + + @Override + public String getScheme() { + return provider.getScheme(); + } + + @Override + public FileSystem newFileSystem(URI uri, Map env) throws IOException { + return provider.newFileSystem(uri, env); + } + + @Override + public FileSystem getFileSystem(URI uri) { + return provider.getFileSystem(uri); + } + + @Override + public Path getPath(URI uri) { + return of(provider.getPath(uri)); + } + + @Override + public FileSystem newFileSystem(Path path, Map env) throws IOException { + return provider.newFileSystem((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), env); + } + + @Override + public InputStream newInputStream(Path path, OpenOption... options) throws IOException { + return provider.newInputStream((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), options); + } + + @Override + public OutputStream newOutputStream(Path path, OpenOption... options) throws IOException { + return provider.newOutputStream((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), options); + } + + @Override + public FileChannel newFileChannel(Path path, Set options, FileAttribute... attrs) throws IOException { + return provider.newFileChannel((path instanceof CloseSuppressPath ? 
((CloseSuppressPath) path).getJavaPath() : path), options, attrs); + } + + @Override + public AsynchronousFileChannel newAsynchronousFileChannel(Path path, Set options, ExecutorService executor, FileAttribute... attrs) throws IOException { + return provider.newAsynchronousFileChannel((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), options, executor, attrs); + } + + @Override + public SeekableByteChannel newByteChannel(Path path, Set options, FileAttribute... attrs) throws IOException { + return provider.newByteChannel((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), options, attrs); + } + + @Override + public DirectoryStream newDirectoryStream(Path dir, DirectoryStream.Filter filter) throws IOException { + return provider.newDirectoryStream((dir instanceof CloseSuppressPath ? ((CloseSuppressPath) dir).getJavaPath() : dir), filter); + } + + @Override + public void createDirectory(Path dir, FileAttribute... attrs) throws IOException { + provider.createDirectory((dir instanceof CloseSuppressPath ? ((CloseSuppressPath) dir).getJavaPath() : dir), attrs); + } + + @Override + public void createSymbolicLink(Path link, Path target, FileAttribute... attrs) throws IOException { + provider.createSymbolicLink((link instanceof CloseSuppressPath ? ((CloseSuppressPath) link).getJavaPath() : link), target, attrs); + } + + @Override + public void createLink(Path link, Path existing) throws IOException { + provider.createLink((link instanceof CloseSuppressPath ? ((CloseSuppressPath) link).getJavaPath() : link), existing); + } + + @Override + public void delete(Path path) throws IOException { + provider.delete((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path)); + } + + @Override + public boolean deleteIfExists(Path path) throws IOException { + return provider.deleteIfExists((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path)); + } + + @Override + public Path readSymbolicLink(Path link) throws IOException { + return provider.readSymbolicLink((link instanceof CloseSuppressPath ? ((CloseSuppressPath) link).getJavaPath() : link)); + } + + @Override + public void copy(Path source, Path target, CopyOption... options) throws IOException { + provider.copy((source instanceof CloseSuppressPath ? ((CloseSuppressPath) source).getJavaPath() : source), + (target instanceof CloseSuppressPath ? ((CloseSuppressPath) target).getJavaPath() : target), options); + } + + @Override + public void move(Path source, Path target, CopyOption... options) throws IOException { + provider.move((source instanceof CloseSuppressPath ? ((CloseSuppressPath) source).getJavaPath() : source), + (target instanceof CloseSuppressPath ? ((CloseSuppressPath) target).getJavaPath() : target), options); + } + + @Override + public boolean isSameFile(Path path, Path path2) throws IOException { + return provider.isSameFile((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), + (path2 instanceof CloseSuppressPath ? ((CloseSuppressPath) path2).getJavaPath() : path2)); + } + + @Override + public boolean isHidden(Path path) throws IOException { + return provider.isHidden((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path)); + } + + @Override + public FileStore getFileStore(Path path) throws IOException { + return provider.getFileStore((path instanceof CloseSuppressPath ? 
((CloseSuppressPath) path).getJavaPath() : path)); + } + + @Override + public void checkAccess(Path path, AccessMode... modes) throws IOException { + provider.checkAccess((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), modes); + } + + @Override + public V getFileAttributeView(Path path, Class type, LinkOption... options) { + return provider.getFileAttributeView((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), type, options); + } + + @Override + public A readAttributes(Path path, Class type, LinkOption... options) throws IOException { + return provider.readAttributes((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), type, options); + } + + @Override + public Map readAttributes(Path path, String attributes, LinkOption... options) throws IOException { + return provider.readAttributes((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), attributes, options); + } + + @Override + public void setAttribute(Path path, String attribute, Object value, LinkOption... options) throws IOException { + provider.setAttribute((path instanceof CloseSuppressPath ? ((CloseSuppressPath) path).getJavaPath() : path), attribute, value, options); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressFileSystem.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressFileSystem.java new file mode 100644 index 00000000..ae7e0dc4 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressFileSystem.java @@ -0,0 +1,81 @@ +package org.rdfhdt.hdt.util.io; + +import java.io.IOException; +import java.nio.file.*; +import java.nio.file.attribute.UserPrincipalLookupService; +import java.nio.file.spi.FileSystemProvider; +import java.util.Set; + +/** + * {@link FileSystem} implementation for {@link CloseSuppressPath} + * + * @author Antoine Willerval + */ +public class CloseSuppressFileSystem extends FileSystem { + private final FileSystem fileSystem; + + + public CloseSuppressFileSystem(FileSystem fileSystem) { + this.fileSystem = fileSystem; + } + + @Override + public FileSystemProvider provider() { + return new CloseSuppressFileProvider(fileSystem.provider()); + } + + @Override + public void close() throws IOException { + fileSystem.close(); + } + + @Override + public boolean isOpen() { + return fileSystem.isOpen(); + } + + @Override + public boolean isReadOnly() { + return fileSystem.isReadOnly(); + } + + @Override + public String getSeparator() { + return fileSystem.getSeparator(); + } + + @Override + public Iterable getRootDirectories() { + return CloseSuppressPath.of(fileSystem.getRootDirectories()); + } + + @Override + public Iterable getFileStores() { + return fileSystem.getFileStores(); + } + + @Override + public Set supportedFileAttributeViews() { + return fileSystem.supportedFileAttributeViews(); + } + + @Override + public Path getPath(String first, String... 
more) { + return CloseSuppressPath.of(fileSystem.getPath(first, more)); + } + + @Override + public PathMatcher getPathMatcher(String syntaxAndPattern) { + return fileSystem.getPathMatcher(syntaxAndPattern); + } + + @Override + public UserPrincipalLookupService getUserPrincipalLookupService() { + return fileSystem.getUserPrincipalLookupService(); + } + + @Override + public WatchService newWatchService() throws IOException { + return fileSystem.newWatchService(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java index 10c0095a..4258a062 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java @@ -8,19 +8,15 @@ import java.io.InputStream; import java.io.OutputStream; import java.net.URI; -import java.nio.file.FileSystem; -import java.nio.file.Files; -import java.nio.file.LinkOption; -import java.nio.file.Path; -import java.nio.file.WatchEvent; -import java.nio.file.WatchKey; -import java.nio.file.WatchService; +import java.nio.file.*; import java.util.Iterator; import java.util.Spliterator; import java.util.function.Consumer; /** * a file that delete itself when we close it + * + * @author Antoine Willerval */ public class CloseSuppressPath implements Path, Closeable { public static final int BUFFER_SIZE = 1 << 13; @@ -39,9 +35,41 @@ public static CloseSuppressPath of(Path component) { return component instanceof CloseSuppressPath ? (CloseSuppressPath) component : new CloseSuppressPath(component); } + private static Path extract(Path other) { + return (other instanceof CloseSuppressPath ? ((CloseSuppressPath) other).getJavaPath() : other); + } + + public static Iterable of(Iterable component) { + return () -> of(component.iterator()); + } + + public static Iterator of(Iterator it) { + return new Iterator<>() { + @Override + public boolean hasNext() { + return it.hasNext(); + } + + @Override + public CloseSuppressPath next() { + return of(it.next()); + } + + @Override + public void remove() { + it.remove(); + } + + @Override + public void forEachRemaining(Consumer action) { + it.forEachRemaining(p -> action.accept(of(p))); + } + }; + } + @Override public FileSystem getFileSystem() { - return wrapper.getFileSystem(); + return new CloseSuppressFileSystem(wrapper.getFileSystem()); } @Override @@ -50,18 +78,18 @@ public boolean isAbsolute() { } @Override - public Path getRoot() { - return wrapper.getRoot(); + public CloseSuppressPath getRoot() { + return of(wrapper.getRoot()); } @Override - public Path getFileName() { - return wrapper.getFileName(); + public CloseSuppressPath getFileName() { + return of(wrapper.getFileName()); } @Override - public Path getParent() { - return wrapper.getParent(); + public CloseSuppressPath getParent() { + return of(wrapper.getParent()); } @Override @@ -70,18 +98,18 @@ public int getNameCount() { } @Override - public Path getName(int index) { - return wrapper.getName(index); + public CloseSuppressPath getName(int index) { + return of(wrapper.getName(index)); } @Override - public Path subpath(int beginIndex, int endIndex) { - return wrapper.subpath(beginIndex, endIndex); + public CloseSuppressPath subpath(int beginIndex, int endIndex) { + return of(wrapper.subpath(beginIndex, endIndex)); } @Override public boolean startsWith(Path other) { - return wrapper.startsWith(other); + return wrapper.startsWith(extract(other)); } @Override @@ -91,7 
+119,7 @@ public boolean startsWith(String other) { @Override public boolean endsWith(Path other) { - return wrapper.endsWith(other); + return wrapper.endsWith(extract(other)); } @Override @@ -100,13 +128,13 @@ public boolean endsWith(String other) { } @Override - public Path normalize() { - return wrapper.normalize(); + public CloseSuppressPath normalize() { + return of(wrapper.normalize()); } @Override public CloseSuppressPath resolve(Path other) { - return of(wrapper.resolve(other)); + return of(wrapper.resolve(extract(other))); } @Override @@ -116,7 +144,7 @@ public CloseSuppressPath resolve(String other) { @Override public CloseSuppressPath resolveSibling(Path other) { - return of(wrapper.resolveSibling(other)); + return of(wrapper.resolveSibling(extract(other))); } @Override @@ -126,7 +154,7 @@ public CloseSuppressPath resolveSibling(String other) { @Override public CloseSuppressPath relativize(Path other) { - return of(wrapper.relativize(other)); + return of(wrapper.relativize(extract(other))); } @Override @@ -135,8 +163,8 @@ public URI toUri() { } @Override - public Path toAbsolutePath() { - return wrapper.toAbsolutePath(); + public CloseSuppressPath toAbsolutePath() { + return of(wrapper.toAbsolutePath()); } @Override @@ -161,18 +189,18 @@ public WatchKey register(WatchService watcher, WatchEvent.Kind... events) thr @Override public Iterator iterator() { - return wrapper.iterator(); + return of(wrapper.iterator()); } @Override public int compareTo(Path other) { - return wrapper.compareTo(other); + return wrapper.compareTo(extract(other)); } @Override public boolean equals(Object other) { if (other instanceof CloseSuppressPath) { - return wrapper.equals(((CloseSuppressPath) other).wrapper); + return wrapper.equals(((CloseSuppressPath) other).getJavaPath()); } return wrapper.equals(other); } @@ -197,28 +225,20 @@ public Spliterator spliterator() { return wrapper.spliterator(); } - private InputStream openInputStream(boolean buffered) throws IOException { - if (buffered) { - return openInputStream(BUFFER_SIZE); - } else { - return Files.newInputStream(wrapper); - } + public InputStream openInputStream(int bufferSize, OpenOption... options) throws IOException { + return new BufferedInputStream(openInputStream(options), bufferSize); } - public InputStream openInputStream(int bufferSize) throws IOException { - return new BufferedInputStream(openInputStream(false), bufferSize); + public InputStream openInputStream(OpenOption... options) throws IOException { + return Files.newInputStream(this, options); } - private OutputStream openOutputStream(boolean buffered) throws IOException { - if (buffered) { - return openOutputStream(BUFFER_SIZE); - } else { - return Files.newOutputStream(wrapper); - } + private OutputStream openOutputStream(OpenOption... options) throws IOException { + return Files.newOutputStream(this, options); } - public OutputStream openOutputStream(int bufferSize) throws IOException { - return new BufferedOutputStream(openOutputStream(false), bufferSize); + public OutputStream openOutputStream(int bufferSize, OpenOption... 
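+ // usage sketch (hypothetical temp file): the path deletes itself on close:
+ //   try (CloseSuppressPath tmp = CloseSuppressPath.of(Files.createTempFile("work", ".bin"));
+ //        OutputStream os = tmp.openOutputStream(CloseSuppressPath.BUFFER_SIZE)) {
+ //       os.write(42); // placeholder payload
+ //   } // stream closed first, then tmp.close() removes the file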
options) throws IOException { + return new BufferedOutputStream(openOutputStream(options), bufferSize); } /** @@ -229,7 +249,7 @@ public void closeWithDeleteRecurse() { } public void mkdirs() throws IOException { - Files.createDirectories(wrapper); + Files.createDirectories(this); } public Path getJavaPath() { @@ -239,9 +259,9 @@ public Path getJavaPath() { @Override public void close() throws IOException { if (isDir) { - IOUtil.deleteDirRecurse(wrapper); + IOUtil.deleteDirRecurse(this); } else { - Files.deleteIfExists(wrapper); + Files.deleteIfExists(this); } } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java index b56b790a..979f56d7 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java @@ -199,7 +199,7 @@ public static InputStream getFileInputStream(String fileName) throws IOException public static InputStream getFileInputStream(String fileName, boolean uncompress) throws IOException { InputStream input; String name = fileName.toLowerCase(); - if (name.startsWith("http:/") || name.startsWith("ftp:/")) { + if (name.startsWith("http:/") || name.startsWith("https:/") || name.startsWith("ftp:/")) { URL url = new URL(fileName); URLConnection con = url.openConnection(); con.connect(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java index 0b5f0916..20e38217 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java @@ -3,6 +3,7 @@ import org.rdfhdt.hdt.compact.integer.VByte; import org.rdfhdt.hdt.exceptions.CRCException; import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.IndexNodeDeltaMergeExceptionIterator; import org.rdfhdt.hdt.triples.IndexedNode; import org.rdfhdt.hdt.util.crc.CRC32; import org.rdfhdt.hdt.util.crc.CRC8; @@ -18,10 +19,11 @@ * * @author Antoine Willerval */ -public class CompressNodeReader implements ExceptionIterator, Closeable { +public class CompressNodeReader implements ExceptionIterator, IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher, Closeable { private final CRCInputStream stream; private final long size; private long index; + private int delta; private boolean waiting; private final IndexedNode last; private final ReplazableString tempString; @@ -55,7 +57,7 @@ public IndexedNode read() throws IOException { if (waiting) { return last; } - int delta = (int) VByte.decode(stream); + delta = (int) VByte.decode(stream); tempString.replace2(stream, delta); long index = VByte.decode(stream); last.setIndex(index); @@ -77,6 +79,21 @@ public IndexedNode next() throws IOException { pass(); return node; } + + @Override + public IndexedNode fetchNode() throws IOException { + if (hasNext()) { + return next(); + } else { + return null; + } + } + + @Override + public int lastDelta() { + return delta; + } + @Override public boolean hasNext() throws IOException { return index < size; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java index 4849a0a9..be9018ef 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java +++ 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java @@ -31,8 +31,8 @@ public class CompressUtil { /** * write a sorted list of indexed node * - * @param strings the nodes to write - * @param output the output + * @param strings the nodes to write + * @param output the output * @param listener the listener to see the progress * @throws IOException writing exception */ @@ -43,9 +43,9 @@ public static void writeCompressedSection(List strings, OutputStrea /** * write a sorted iterator of indexed node * - * @param it iterator to write - * @param size size of the iterator - * @param output the output where to write + * @param it iterator to write + * @param size size of the iterator + * @param output the output where to write * @param listener the listener to see the progress * @throws IOException writing exception */ @@ -70,9 +70,9 @@ public static void writeCompressedSection(ExceptionIterator nodes, DuplicatedNodeConsumer duplicatedNodeConsumer) { return new DuplicatedIterator(nodes.asIterator(), duplicatedNodeConsumer); } + /** + * Duplicate consumer for the {@link #asNoDupeCharSequenceIterator(org.rdfhdt.hdt.iterator.utils.ExceptionIterator, org.rdfhdt.hdt.util.io.compress.CompressUtil.DuplicatedNodeConsumer)} method + */ @FunctionalInterface public interface DuplicatedNodeConsumer { + /** + * called when the {@link org.rdfhdt.hdt.util.io.compress.CompressUtil.DuplicatedIterator} find a duplicated element + * + * @param originalIndex the index id of the first element + * @param duplicatedIndex the index id of the duplicate element + * @param originalHeader the header of the first element + */ void onDuplicated(long originalIndex, long duplicatedIndex, long originalHeader); } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java index cbedd70e..60ef66ec 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java @@ -15,6 +15,7 @@ import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; +import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTOptionsKeys; @@ -582,27 +583,24 @@ public void generateTest() throws IOException, ParserException, NotFoundExceptio String ntFile = Objects.requireNonNull(getClass().getClassLoader().getResource(file), "Can't find " + file).getFile(); // create DISK HDT try (InputStream in = IOUtil.getFileInputStream(ntFile)) { - Iterator it = RDFParserFactory.readAsIterator( + try (PipedCopyIterator it = RDFParserFactory.readAsIterator( RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES, true), in, HDTTestUtils.BASE_URI, true, RDFNotation.NTRIPLES - ); - HDT expected = HDTManager.generateHDT( - it, - HDTTestUtils.BASE_URI, - spec, - quiet ? null : this - ); - - String testCopy = tempDir.newFile().getAbsolutePath(); - expected.saveToHDT(testCopy, null); - - // create MEMORY HDT - HDT actual = HDTManager.loadHDT(testCopy); - - try { - assertEqualsHDT(expected, actual); - } finally { - IOUtil.closeAll(expected, actual); + )) { + try (HDT expected = HDTManager.generateHDT( + it, + HDTTestUtils.BASE_URI, + spec, + quiet ? 
null : this + )) { + String testCopy = tempDir.newFile().getAbsolutePath(); + expected.saveToHDT(testCopy, null); + + // create MEMORY HDT + try (HDT actual = HDTManager.loadHDT(testCopy)) { + assertEqualsHDT(expected, actual); + } + } } } } @@ -620,6 +618,8 @@ public void bigDiskTest() throws ParserException, IOException { HDTOptions spec = new HDTSpecification(); spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, output.resolve("future.hdt").toAbsolutePath().toString()); spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, output.resolve("gen_dir").toAbsolutePath().toString()); + spec.set(HDTOptionsKeys.NT_SIMPLE_PARSER_KEY, "true"); + spec.set(HDTOptionsKeys.PROFILER_KEY, "true"); StopWatch watch = new StopWatch(); watch.reset(); try (HDT hdt = HDTManager.generateHDTDisk(supplier.createTripleStringStream(), "http://ex.ogr/#", spec, @@ -634,10 +634,11 @@ public void bigCatTreeDiskTest() throws ParserException, IOException { HDTOptions spec = new HDTSpecification(); StopWatch watch = new StopWatch(); spec.set(HDTOptionsKeys.LOADER_CATTREE_LOCATION_KEY, "C:\\WIKI\\CATTREE\\WORKING"); - spec.set(HDTOptionsKeys.LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY, "C:\\ISWC\\CATTREE\\future.hdt"); + spec.set(HDTOptionsKeys.LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY, "C:\\WIKI\\CATTREE\\future.hdt"); spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, "C:\\WIKI\\CATTREE\\WORKING_HDTDISK"); spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY, "12"); spec.set(HDTOptionsKeys.NT_SIMPLE_PARSER_KEY, "true"); + spec.set(HDTOptionsKeys.PROFILER_KEY, "true"); watch.reset(); try (HDT hdt = HDTManager.catTree( RDFFluxStop.sizeLimit(100_000_000_000L) // 300GB free @@ -650,5 +651,29 @@ public void bigCatTreeDiskTest() throws ParserException, IOException { System.out.println(hdt.getTriples().getNumberOfElements()); } } + @Test + public void bigGenCatTreeDiskTest() throws ParserException, IOException { + LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier + .createSupplierWithMaxSize(10_000_000_000L, 94); + HDTOptions spec = new HDTSpecification(); + StopWatch watch = new StopWatch(); + spec.set(HDTOptionsKeys.LOADER_CATTREE_LOCATION_KEY, "C:\\WIKI\\CATTREE\\WORKING"); + spec.set(HDTOptionsKeys.LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY, "C:\\WIKI\\CATTREE\\future.hdt"); + spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, "C:\\WIKI\\CATTREE\\WORKING_HDTDISK"); + spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY, "12"); + spec.set(HDTOptionsKeys.NT_SIMPLE_PARSER_KEY, "true"); + spec.set(HDTOptionsKeys.PROFILER_KEY, "true"); + watch.reset(); + try (HDT hdt = HDTManager.catTree( + RDFFluxStop.sizeLimit(100_000_000_000L) // 300GB free + .and(RDFFluxStop.countLimit(700_000_000L) // ~9GB maps + ), HDTSupplier.disk(), + supplier.createTripleStringStream(), HDTTestUtils.BASE_URI, spec, + (level, message) -> System.out.println("[" + level + "] " + message) + )) { + System.out.println(watch.stopAndShow()); + System.out.println(hdt.getTriples().getNumberOfElements()); + } + } } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java new file mode 100644 index 00000000..097192a5 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java @@ -0,0 +1,348 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.junit.Test; +import org.rdfhdt.hdt.triples.IndexedNode; +import 
org.rdfhdt.hdt.util.string.AssertionCharSequence; + +import java.util.Iterator; +import java.util.List; +import java.util.Random; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.rdfhdt.hdt.iterator.utils.IndexNodeDeltaMergeExceptionIterator.compareToDelta; + +public class IndexNodeDeltaMergeExceptionIteratorTest { + + @Test + public void compareToDeltaTest() { + assertEquals( + 3, + compareToDelta( + 2, + new IndexedNode("aaa", 0), + new IndexedNode("aab", 1) + )); + assertEquals( + -3, + compareToDelta( + 2, + new IndexedNode("aab", 1), + new IndexedNode("aaa", 0) + )); + assertEquals( + 3, + compareToDelta( + 2, + new IndexedNode("aaad", 0), + new IndexedNode("aabd", 1) + )); + assertEquals( + -3, + compareToDelta( + 2, + new IndexedNode("aabd", 1), + new IndexedNode("aaad", 0) + )); + assertEquals( + 4, + compareToDelta( + 2, + new IndexedNode("aaaad", 0), + new IndexedNode("aaabd", 1) + )); + assertEquals( + -4, + compareToDelta( + 2, + new IndexedNode("aaabd", 1), + new IndexedNode("aaaad", 0) + )); + + assertEquals( + 3, + compareToDelta( + 0, + new IndexedNode("aa", 1), + new IndexedNode("aa", 0) + )); + } + + private final AtomicLong ids = new AtomicLong(); + + public IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher createFromSortedArray(AssertionCharSequence... array) { + List elements = Stream.of(array).map(s -> new IndexedNode(s, ids.incrementAndGet())).collect(Collectors.toList()); + + return new IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher<>() { + int index = 0; + int delta; + + @Override + public IndexedNode fetchNode() throws RuntimeException { + if (elements.size() <= index) { + return null; + } + + IndexedNode next = elements.get(index++); + + delta = 0; + + if (index > 1) { + IndexedNode prev = elements.get(index - 2); + CharSequence s1 = ((AssertionCharSequence) prev.getNode()).getSequence(); + CharSequence s2 = ((AssertionCharSequence) next.getNode()).getSequence(); + + delta = 0; + int len = Math.min(s1.length(), s2.length()); + while (delta < len && s1.charAt(delta) == s2.charAt(delta)) { + delta++; + } + } + return next; + } + + @Override + public int lastDelta() throws RuntimeException { + return delta; + } + }; + } + + @Test + public void deltaComputeTest() { + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it = createFromSortedArray( + new AssertionCharSequence("", 0), + new AssertionCharSequence("", 0), + new AssertionCharSequence("aaa", 0), + new AssertionCharSequence("aab", 0), + new AssertionCharSequence("aac", 0), + new AssertionCharSequence("aacd", 0), + new AssertionCharSequence("abcd", 0), + new AssertionCharSequence("bbcd", 0) + ); + + assertEquals("", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(0, it.lastDelta()); + + assertEquals("", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(0, it.lastDelta()); + + assertEquals("aaa", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(0, it.lastDelta()); + + assertEquals("aab", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(2, it.lastDelta()); + + assertEquals("aac", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(2, it.lastDelta()); + + 
assertEquals("aacd", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(3, it.lastDelta()); + + assertEquals("abcd", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(1, it.lastDelta()); + + assertEquals("bbcd", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(0, it.lastDelta()); + + assertNull(it.fetchNode()); + } + + @Test + public void mergeComputeTest() { + List output = List.of( + "", + "a", + "b" + ); + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it1 = + createFromSortedArray( + new AssertionCharSequence("a", 0) + ); + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it2 = + createFromSortedArray( + new AssertionCharSequence("", 0), + new AssertionCharSequence("b", 0) + ); + + ExceptionIterator it = IndexNodeDeltaMergeExceptionIterator.buildOfTree( + List.of(it1, it2), + 0, + 2 + ); + + Iterator itE = output.iterator(); + while (it.hasNext()) { + CharSequence sequence = ((AssertionCharSequence) it.next().getNode()).getSequence(); + assertTrue("missing: " + sequence, itE.hasNext()); + assertEquals(itE.next(), sequence); + } + assertFalse(it.hasNext()); + } + + + @Test + public void mergeCountComputeTest() { + List output = List.of( + "", + "aaa", + "aab", + "aabb", + "aacd", + "aacde", + "bacd", + "bacdd", + "bacde", + "bacdz" + ); + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it1 = + createFromSortedArray( + new AssertionCharSequence("aaa", 0), + new AssertionCharSequence("aacde", 0), + new AssertionCharSequence("bacd", 0), + new AssertionCharSequence("bacdd", 0) + ); + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it2 = + createFromSortedArray( + new AssertionCharSequence("", 0), + new AssertionCharSequence("aab", 0), + new AssertionCharSequence("aabb", 0), + new AssertionCharSequence("aacd", 0), + new AssertionCharSequence("bacde", 0), + new AssertionCharSequence("bacdz", 0) + ); + + ExceptionIterator it = IndexNodeDeltaMergeExceptionIterator.buildOfTree( + List.of(it1, it2), + 0, + 2 + ); + + Iterator itE = output.iterator(); + while (it.hasNext()) { + CharSequence sequence = ((AssertionCharSequence) it.next().getNode()).getSequence(); + assertTrue("missing: " + sequence, itE.hasNext()); + assertEquals(itE.next(), sequence); + } + assertFalse(itE.hasNext()); + } + + + @Test + public void deepMergeComputeTest() { + List output = List.of( + "", + "aa", + "aaa", + "aaa", + "aab", + "aabb", + "aabb", + "aacc", + "aacd", + "aacde", + "aacdef", + "b", + "bcd", + "bcz", + "bz", + "bze", + "cd", + "ce" + ); + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it1 = + createFromSortedArray( + new AssertionCharSequence("aa", 0), + new AssertionCharSequence("aaa", 0), + new AssertionCharSequence("aaa", 0), + new AssertionCharSequence("aabb", 0), + new AssertionCharSequence("aacc", 0) + ); + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it2 = + createFromSortedArray( + new AssertionCharSequence("aab", 0), + new AssertionCharSequence("aabb", 0), + new AssertionCharSequence("aacd", 0), + new AssertionCharSequence("b", 0), + new AssertionCharSequence("bcd", 0), + new AssertionCharSequence("bcz", 0), + new AssertionCharSequence("bz", 0), + new AssertionCharSequence("bze", 0) + ); + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it3 = + createFromSortedArray( + new AssertionCharSequence("", 0), + new AssertionCharSequence("aacde", 0) + ); + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it4 = + 
createFromSortedArray( + new AssertionCharSequence("cd", 0), + new AssertionCharSequence("ce", 0) + ); + IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it5 = + createFromSortedArray( + new AssertionCharSequence("aacdef", 0) + ); + + ExceptionIterator it = IndexNodeDeltaMergeExceptionIterator.buildOfTree( + List.of(it1, it2, it3, it4, it5), + 0, + 5 + ); + + ((IndexNodeDeltaMergeExceptionIterator) it).printMergeTree(); + + Iterator itE = output.iterator(); + while (it.hasNext()) { + assertTrue(itE.hasNext()); + CharSequence seq = ((AssertionCharSequence) it.next().getNode()).getSequence(); + System.out.println(seq); + assertEquals(itE.next(), seq); + } + assertFalse(itE.hasNext()); + } + + @Test + public void largeTest() { + // (tried with 200_000) + final long size = 2_000; + Random random = new Random(35); + List randy = Stream.generate(() -> { + String table = "abcd"; + StringBuilder bld = new StringBuilder(); + // +1 because we don't have empty strings during this step + int bn = 1 + random.nextInt(20); + while (bn > 0) { + bld.append(table.charAt(bn % table.length())); + bn /= table.length(); + } + return bld.toString(); + }) + .limit(size) + .collect(Collectors.toList()); + List sortedRandy = randy.stream().sorted().collect(Collectors.toList()); + + assertEquals(size, sortedRandy.size()); + + ExceptionIterator it = IndexNodeDeltaMergeExceptionIterator.buildOfTree( + s -> createFromSortedArray(new AssertionCharSequence(s, 0)), randy, + 0, + randy.size() + ); + + int index = 0; + Iterator itE = sortedRandy.iterator(); + while (it.hasNext()) { + assertTrue(itE.hasNext()); + CharSequence actual = ((AssertionCharSequence) it.next().getNode()).getSequence(); + assertEquals("Difference with index #" + index++, itE.next(), actual); + } + assertFalse(itE.hasNext()); + + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/RDFFluxStopTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/RDFFluxStopTest.java new file mode 100644 index 00000000..1af5212c --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/RDFFluxStopTest.java @@ -0,0 +1,65 @@ +package org.rdfhdt.hdt.rdf; + +import org.junit.Test; +import org.rdfhdt.hdt.options.HDTOptionsKeys; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +public class RDFFluxStopTest { + private void assertExportSame(RDFFluxStop flux) { + assertEquals(flux, RDFFluxStop.readConfig(flux.asConfig())); + } + + @Test + public void optionTest() { + assertEquals(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_NO_LIMIT + ":0", RDFFluxStop.noLimit().asConfig()); + assertEquals(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":42", RDFFluxStop.countLimit(42).asConfig()); + assertEquals(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":34", RDFFluxStop.sizeLimit(34).asConfig()); + + + assertEquals( + "(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":42)&(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":34)", + RDFFluxStop.countLimit(42).and(RDFFluxStop.sizeLimit(34)).asConfig() + ); + assertEquals( + "(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":42)|(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":34)", + RDFFluxStop.countLimit(42).or(RDFFluxStop.sizeLimit(34)).asConfig() + ); + assertEquals( + RDFFluxStop.readConfig("(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":42)&(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":34)"), + RDFFluxStop.countLimit(42).and(RDFFluxStop.sizeLimit(34)) + ); + assertEquals( + RDFFluxStop.readConfig("(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":42)|(" + 
HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":34)"), + RDFFluxStop.countLimit(42).or(RDFFluxStop.sizeLimit(34)) + ); + + assertExportSame(RDFFluxStop.countLimit(42).or(RDFFluxStop.sizeLimit(34))); + assertExportSame((RDFFluxStop.countLimit(42).and(RDFFluxStop.countLimit(1))).or(RDFFluxStop.sizeLimit(34).and(RDFFluxStop.noLimit())).and(RDFFluxStop.countLimit(23))); + + assertNull(RDFFluxStop.readConfig("")); + assertNull(RDFFluxStop.readConfig(null)); + assertNull(RDFFluxStop.readConfig("()")); + } + + @Test(expected = IllegalArgumentException.class) + public void badSyntax() { + RDFFluxStop.readConfig("noLimit"); + } + + @Test(expected = IllegalArgumentException.class) + public void badSyntax2() { + RDFFluxStop.readConfig("noLimit:2&z"); + } + + @Test(expected = IllegalArgumentException.class) + public void badSyntax3() { + RDFFluxStop.readConfig("''"); + } + + @Test(expected = IllegalArgumentException.class) + public void badSyntax4() { + RDFFluxStop.readConfig("a:b"); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java index cfc2be32..e3cd825e 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java @@ -27,22 +27,22 @@ public void streamTest() throws IOException { triples.createNTFile(testNt.toAbsolutePath().toString()); try (InputStream is = Files.newInputStream(testNt)) { - PipedCopyIterator it = RDFParserFactory.readAsIterator( + try (PipedCopyIterator it = RDFParserFactory.readAsIterator( RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES), is, HDTTestUtils.BASE_URI, true, RDFNotation.NTRIPLES - ); - - it.forEachRemaining(s -> { - try { - Thread.sleep(50); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - System.out.println(s + " " + s.getSubject().getClass()); - }); + )) { + it.forEachRemaining(s -> { + try { + Thread.sleep(50); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + System.out.println(s + " " + s.getSubject().getClass()); + }); + } } } } \ No newline at end of file diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/ProfilerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/ProfilerTest.java new file mode 100644 index 00000000..b00cf010 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/ProfilerTest.java @@ -0,0 +1,139 @@ +package org.rdfhdt.hdt.util; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; + +import static org.junit.Assert.*; + +public class ProfilerTest { + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + @Test + public void ioTest() throws IOException, InterruptedException { + Path root = tempDir.getRoot().toPath(); + + Profiler profiler = new Profiler("test"); + profiler.pushSection("tests1"); + { + profiler.pushSection("tests1s1"); + { + Thread.sleep(25L); + } + profiler.popSection(); + + profiler.pushSection("tests1s2"); + { + Thread.sleep(5L); + } + profiler.popSection(); + + profiler.pushSection("tests1s3"); + { + profiler.pushSection("tests1s3s1"); + { + Thread.sleep(5L); + } + profiler.popSection(); + } + profiler.popSection(); + } + profiler.popSection(); + profiler.pushSection("tests2"); + { + Thread.sleep(5L); + } + profiler.popSection(); + + 
profiler.stop(); + profiler.writeProfiling(); + + Path profiling = root.resolve("profiling"); + profiler.writeToDisk(profiling); + + Profiler p2 = Profiler.readFromDisk(profiling); + + assertEquals(profiler.getMainSection(), p2.getMainSection()); + } + + @Test + public void structTest() throws InterruptedException { + Profiler profiler = new Profiler("test"); + profiler.pushSection("tests1"); + { + profiler.pushSection("tests1s1"); + { + Thread.sleep(25L); + } + profiler.popSection(); + + profiler.pushSection("tests1s2"); + { + Thread.sleep(5L); + } + profiler.popSection(); + + profiler.pushSection("tests1s3"); + { + profiler.pushSection("tests1s3s1"); + { + Thread.sleep(5L); + } + profiler.popSection(); + } + profiler.popSection(); + } + profiler.popSection(); + profiler.pushSection("tests2"); + { + Thread.sleep(5L); + } + profiler.popSection(); + + profiler.stop(); + + Profiler.Section test = profiler.getMainSection(); + assertEquals("test", test.getName()); + List testSub = test.getSubSections(); + assertEquals(2, testSub.size()); + + Profiler.Section tests1 = testSub.get(0); + assertEquals("tests1", tests1.getName()); + List tests1Sub = tests1.getSubSections(); + assertEquals(3, tests1Sub.size()); + + Profiler.Section tests1s1 = tests1Sub.get(0); + assertEquals("tests1s1", tests1s1.getName()); + List tests1s1Sub = tests1s1.getSubSections(); + assertEquals(0, tests1s1Sub.size()); + + Profiler.Section tests1s2 = tests1Sub.get(1); + assertEquals("tests1s2", tests1s2.getName()); + List tests1s2Sub = tests1s2.getSubSections(); + assertEquals(0, tests1s2Sub.size()); + + Profiler.Section tests1s3 = tests1Sub.get(2); + assertEquals("tests1s3", tests1s3.getName()); + List tests1s3Sub = tests1s3.getSubSections(); + assertEquals(1, tests1s3Sub.size()); + + Profiler.Section tests1s3s1 = tests1s3Sub.get(0); + assertEquals("tests1s3s1", tests1s3s1.getName()); + assertEquals(0, tests1s3s1.getSubSections().size()); + + Profiler.Section tests2 = testSub.get(1); + assertEquals("tests2", tests2.getName()); + assertEquals(0, tests2.getSubSections().size()); + } + + @Test(expected = IllegalArgumentException.class) + public void popTest() { + Profiler p = new Profiler(""); + + p.popSection(); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/CloseSuppressPathTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/CloseSuppressPathTest.java new file mode 100644 index 00000000..3ffd2382 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/CloseSuppressPathTest.java @@ -0,0 +1,54 @@ +package org.rdfhdt.hdt.util.io; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.Assert.*; + +public class CloseSuppressPathTest { + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + + @Test + public void createDelTest() throws IOException { + Path path = tempDir.getRoot().toPath(); + + CloseSuppressPath test = CloseSuppressPath.of(path.resolve("test")); + + Files.writeString(test, "test"); + assertTrue(Files.exists(test)); + test.close(); + assertFalse(Files.exists(test)); + } + + @Test + public void createDelRecTest() throws IOException { + Path path = tempDir.getRoot().toPath(); + + CloseSuppressPath test = CloseSuppressPath.of(path.resolve("test")); + test.closeWithDeleteRecurse(); + Files.createDirectories(test); + + CloseSuppressPath test2 = test.resolve("test2"); + Files.writeString(test2, "test"); + 
assertTrue(Files.exists(test)); + assertTrue(Files.exists(test2)); + test.close(); + assertFalse(Files.exists(test2)); + assertFalse(Files.exists(test)); + } + + @Test + public void pathTest() { + Path path = tempDir.getRoot().toPath(); + + assertEquals(CloseSuppressPath.of(path), path); + // known unresolvable issue + assertNotEquals(path, CloseSuppressPath.of(path)); + } +} \ No newline at end of file diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java index 791f3a3a..2100e667 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java @@ -155,7 +155,7 @@ public void closeablePathTest() throws IOException { Path p1 = p.resolve("test1"); try (CloseSuppressPath csp = CloseSuppressPath.of(p1)) { - Files.writeString(csp.getJavaPath(), "test"); + Files.writeString(csp, "test"); Assert.assertTrue(Files.exists(p1)); } Assert.assertFalse(Files.exists(p1)); @@ -164,7 +164,7 @@ public void closeablePathTest() throws IOException { Path p2 = p.resolve("test2"); try (CloseSuppressPath csp = CloseSuppressPath.of(p2)) { csp.closeWithDeleteRecurse(); - Path p3 = csp.getJavaPath().resolve("test3/test4/test5"); + Path p3 = csp.resolve("test3/test4/test5"); Path f4 = p3.resolve("child.txt"); Files.createDirectories(p3); Files.writeString(f4, "hello world"); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequence.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequence.java new file mode 100644 index 00000000..ee024ab5 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequence.java @@ -0,0 +1,62 @@ +package org.rdfhdt.hdt.util.string; + +import java.util.stream.IntStream; + +/** + * CharSequence wrapper throwing an {@link java.lang.AssertionError} if we try to read before the minimum index + */ +public class AssertionCharSequence implements CharSequence { + private final CharSequence sequence; + private final int minimumRead; + + /** + * create an assertion cs + * + * @param sequence wrapped sequence + * @param minimumRead minimum index to read (inclusive) + */ + public AssertionCharSequence(CharSequence sequence, int minimumRead) { + this.sequence = sequence; + this.minimumRead = minimumRead; + } + + @Override + public int length() { + return sequence.length(); + } + + @Override + public char charAt(int index) { + if (index < minimumRead) { + throw new AssertionError("Tried to read before minimum index! " + index + " / " + minimumRead); + } + return sequence.charAt(index); + } + + @Override + public CharSequence subSequence(int start, int end) { + if (start < minimumRead) { + throw new AssertionError("Tried to create subSequence before minimum index! 
" + start + " / " + minimumRead); + } + return sequence.subSequence(start, end); + } + + @Override + public String toString() { + throw new AssertionError("Tried to convert an AssertionCharSequence to string!"); + } + + @Override + public IntStream chars() { + return sequence.chars(); + } + + @Override + public IntStream codePoints() { + return sequence.codePoints(); + } + + public CharSequence getSequence() { + return sequence; + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequenceTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequenceTest.java new file mode 100644 index 00000000..7ef6efd8 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequenceTest.java @@ -0,0 +1,43 @@ +package org.rdfhdt.hdt.util.string; + +import org.junit.Test; + +import static org.junit.Assert.*; + +public class AssertionCharSequenceTest { + @Test + public void afterBoundTest() { + AssertionCharSequence seq = new AssertionCharSequence("aaabbb", 3); + + seq.charAt(3); + seq.charAt(4); + seq.charAt(5); + + seq.subSequence(3, 6); + } + + @Test(expected = AssertionError.class) + public void beforeBoundTest() { + + AssertionCharSequence seq = new AssertionCharSequence("aaabbb", 3); + + seq.charAt(2); + } + + @Test(expected = AssertionError.class) + public void toStringTest() { + + AssertionCharSequence seq = new AssertionCharSequence("aaabbb", 3); + + assertNotNull(seq.toString()); + } + + @Test(expected = AssertionError.class) + public void subTest() { + + AssertionCharSequence seq = new AssertionCharSequence("aaabbb", 3); + + seq.subSequence(0, 3); + } + +} From da187f8900a900fee31375e06b1345babd3b120b Mon Sep 17 00:00:00 2001 From: qaate47 Date: Wed, 26 Oct 2022 17:13:43 +0200 Subject: [PATCH 4/9] Start MultipleDictionary implementation for disk generation, fix delta merger, cleanup code, better ByteString handle --- .../org/rdfhdt/hdt/dictionary/Dictionary.java | 52 +- .../hdt/dictionary/DictionarySection.java | 6 +- .../org/rdfhdt/hdt/enums/RDFNotation.java | 50 +- .../java/org/rdfhdt/hdt/hdt/HDTVersion.java | 4 +- .../org/rdfhdt/hdt/options/HDTOptions.java | 201 ++++- .../rdfhdt/hdt/options/HDTOptionsKeys.java | 73 +- .../java/org/rdfhdt/hdt/rdf/RDFFluxStop.java | 730 +++++++++--------- .../java/org/rdfhdt/hdt/rdf/TripleWriter.java | 2 +- .../java/org/rdfhdt/hdt/triples/TripleID.java | 6 +- .../org/rdfhdt/hdt/triples/TripleString.java | 4 +- .../java/org/rdfhdt/hdt/tools/HDT2RDF.java | 6 +- .../java/org/rdfhdt/hdt/tools/HDTCat.java | 20 +- .../java/org/rdfhdt/hdt/tools/RDF2HDT.java | 5 +- .../org/rdfhdt/hdt/cache/DictionaryCache.java | 8 +- .../hdt/cache/DictionaryCacheArray.java | 5 +- .../hdt/cache/DictionaryCacheArrayWeak.java | 4 +- .../rdfhdt/hdt/cache/DictionaryCacheHash.java | 2 +- .../rdfhdt/hdt/cache/DictionaryCacheLRI.java | 6 +- .../rdfhdt/hdt/cache/DictionaryCacheLRU.java | 2 +- .../hdt/compact/bitmap/AdjacencyList.java | 6 +- .../rdfhdt/hdt/compact/bitmap/Bitmap375.java | 6 +- .../hdt/compact/bitmap/Bitmap64Disk.java | 2 +- .../org/rdfhdt/hdt/compact/integer/VByte.java | 8 +- .../compact/sequence/SequenceLog64Map.java | 6 +- .../rdfhdt/hdt/dictionary/DictionaryDiff.java | 4 +- .../hdt/dictionary/DictionaryFactory.java | 154 +++- .../rdfhdt/hdt/dictionary/TempDictionary.java | 6 +- .../hdt/dictionary/TempDictionarySection.java | 10 + .../hdt/dictionary/impl/BaseDictionary.java | 4 +- .../dictionary/impl/BaseTempDictionary.java | 4 +- .../impl/CompressFourSectionDictionary.java | 122 +-- 
.../impl/FourSectionDictionaryDiff.java | 8 +- .../hdt/dictionary/impl/HashDictionary.java | 10 +- .../MultDictionaryPFCOptimizedExtractor.java | 32 +- .../impl/MultipleBaseDictionary.java | 153 ++-- .../impl/MultipleSectionDictionary.java | 396 +++++----- .../impl/MultipleSectionDictionaryBig.java | 87 +-- .../impl/MultipleSectionDictionaryCat.java | 122 +-- .../impl/MultipleSectionDictionaryDiff.java | 76 +- .../impl/WriteMultipleSectionDictionary.java | 200 +++++ .../impl/section/HashDictionarySection.java | 82 +- .../section/OneReadDictionarySection.java | 30 +- .../impl/section/PFCDictionarySection.java | 2 +- .../section/PFCDictionarySectionBuilder.java | 4 +- .../impl/section/WriteDictionarySection.java | 18 +- .../impl/utilCat/CatMappingBack.java | 41 +- .../dictionary/impl/utilCat/CatWrapper.java | 5 +- .../dictionary/impl/utilCat/SectionUtil.java | 10 +- .../dictionary/impl/utilDiff/DiffWrapper.java | 4 +- .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java | 316 ++------ .../rdfhdt/hdt/hdt/impl/HDTDiskImporter.java | 364 +++++++++ .../java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java | 84 +- .../org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java | 9 +- .../impl/diskimport/CompressTripleMapper.java | 6 + .../impl/diskimport/SectionCompressor.java | 18 +- .../hdt/hdt/writer/TripleWriterHDT.java | 2 +- .../hdt/hdt/writer/TripleWriterNtriples.java | 2 +- .../IndexNodeDeltaMergeExceptionIterator.java | 8 +- .../iterator/utils/ListTripleIDIterator.java | 8 +- .../hdt/iterator/utils/PipedCopyIterator.java | 9 + .../iterator/utils/RepeatApplyIterator.java | 4 +- .../hdt/iterator/utils/SeveralIterator.java | 2 +- .../rdfhdt/hdt/iterator/utils/SideEffect.java | 2 +- .../rdfhdt/hdt/options/HDTOptionsBase.java | 12 +- .../rdfhdt/hdt/rdf/parsers/RDFParserRAR.java | 5 +- .../rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java | 2 +- .../hdt/triples/DictionaryEntriesDiff.java | 2 +- .../hdt/triples/TripleIDComparatorInt.java | 2 +- .../hdt/triples/impl/BitmapTriples.java | 12 +- .../impl/BitmapTriplesIteratorCat.java | 6 +- .../triples/impl/BitmapTriplesIteratorY.java | 11 +- .../impl/BitmapTriplesIteratorYFOQ.java | 23 +- .../triples/impl/BitmapTriplesIteratorZ.java | 12 +- .../FourSectionDictionaryEntriesDiff.java | 4 +- .../MultipleSectionDictionaryEntriesDiff.java | 13 +- .../hdt/triples/impl/PredicateIndex.java | 12 +- .../rdfhdt/hdt/triples/impl/TripleIDInt.java | 12 +- .../hdt/triples/impl/TriplesListLong.java | 4 +- .../org/rdfhdt/hdt/util/CustomIterator.java | 12 +- .../org/rdfhdt/hdt/util/LiteralsUtils.java | 401 ++++++++-- .../java/org/rdfhdt/hdt/util/Profiler.java | 16 +- .../org/rdfhdt/hdt/util/ProfilingUtil.java | 8 +- .../java/org/rdfhdt/hdt/util/RDFInfo.java | 7 +- .../hdt/util/concurrent/ExceptionThread.java | 96 ++- .../hdt/util/concurrent/KWayMerger.java | 5 +- .../rdfhdt/hdt/util/io/CloseInputStream.java | 4 +- .../java/org/rdfhdt/hdt/util/io/IOUtil.java | 40 +- .../util/io/compress/CompressNodeReader.java | 5 + .../hdt/util/io/compress/CompressUtil.java | 7 +- .../hdt/util/listener/ListenerUtil.java | 2 +- .../rdfhdt/hdt/util/string/ByteString.java | 30 + .../hdt/util/string/ByteStringUtil.java | 57 +- .../util/string/CharSequenceComparator.java | 14 +- .../string/CharSequenceCustomComparator.java | 37 +- .../rdfhdt/hdt/util/string/CompactString.java | 50 +- .../hdt/util/string/ReplazableString.java | 76 +- .../hdt/utils/DebugOrderNodeIterator.java | 71 ++ .../CompressFourSectionDictionaryTest.java | 9 +- .../section/OneReadDictionarySectionTest.java | 7 +- .../org/rdfhdt/hdt/hdt/HDTManagerTest.java | 406 
++++++++-- .../hdt/hdt/impl/TempHDTImporterTest.java | 41 +- .../rdfhdt/hdt/hdtCat/HdtCatLiteralsTest.java | 42 +- .../rdfhdt/hdt/hdtCat/HdtCatRandomTest.java | 5 +- .../org/rdfhdt/hdt/hdtCat/utils/Utility.java | 22 +- .../rdfhdt/hdt/hdtDiff/HdtDiffStaticTest.java | 5 +- .../org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java | 15 +- ...exNodeDeltaMergeExceptionIteratorTest.java | 1 - .../hdt/literalsDict/HDTLiteralsDictTest.java | 41 +- .../org/rdfhdt/hdt/rdf/RDFFluxStopTest.java | 13 +- .../impl/BitmapTriplesIteratorDiffTest.java | 10 +- .../BitmapTriplesIteratorPositionTest.java | 25 +- .../hdt/triples/impl/utils/HDTTestUtils.java | 148 +++- .../util/LargeFakeDataSetStreamSupplier.java | 243 +++++- .../LargeFakeDataSetStreamSupplierTest.java | 18 +- .../rdfhdt/hdt/util/LiteralsUtilsTest.java | 105 +++ .../util/io/compress/CompressNodeTest.java | 18 +- .../hdt/util/io/compress/CompressTest.java | 7 +- .../string/AssertionCharSequenceTest.java | 2 + .../test/resources/HdtCatLiteralsTest.java | 4 +- .../java/org/rdfhdt/hdtjena/DummyMap.java | 2 +- .../java/org/rdfhdt/hdtjena/HDTGraph.java | 8 +- .../org/rdfhdt/hdtjena/HDTGraphAssembler.java | 2 +- .../org/rdfhdt/hdtjena/NodeDictionary.java | 20 +- .../rdfhdt/hdtjena/solver/HDTOptimizedOp.java | 4 +- .../rdfhdt/hdtjena/solver/OpExecutorHDT.java | 14 +- .../hdtjena/solver/StageMatchTripleID.java | 68 +- 126 files changed, 3997 insertions(+), 2010 deletions(-) create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LiteralsUtilsTest.java diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/Dictionary.java b/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/Dictionary.java index d6c54e07..68b5c96e 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/Dictionary.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/Dictionary.java @@ -27,13 +27,12 @@ */ -import java.io.Closeable; -import java.util.HashMap; -import java.util.TreeMap; - import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.header.Header; +import java.io.Closeable; +import java.util.Map; + /** * Interface that specifies the basic methods for any Dictionary implementation @@ -53,7 +52,7 @@ public interface Dictionary extends Closeable { * TriplePosition of the id in the dictionary * @return String */ - public CharSequence idToString(long id, TripleComponentRole position); + CharSequence idToString(long id, TripleComponentRole position); /** * Returns the id for a given string @@ -64,11 +63,7 @@ public interface Dictionary extends Closeable { * TriplePosition of the string in the dictionary * @return int */ - public long stringToId(CharSequence str, TripleComponentRole position); - - /** - * Returns the number of elements in the dictionary - */ + long stringToId(CharSequence str, TripleComponentRole position); /** * Returns the data type of a given literal string @@ -77,54 +72,57 @@ public interface Dictionary extends Closeable { * The id to get the data type for * @return String */ - public String dataTypeOfId(long id); + CharSequence dataTypeOfId(long id); - public long getNumberOfElements(); + /** + * Returns the number of elements in the dictionary + */ + long 
getNumberOfElements(); /** * Return the combined size of the sections of the dictionary (in bytes) */ - public long size(); + long size(); /** * Returns the number of subjects in the dictionary. Note: Includes shared. */ - public long getNsubjects(); + long getNsubjects(); /** * Returns the number of predicates in the dictionary. */ - public long getNpredicates(); + long getNpredicates(); /** * Returns the number of objects in the dictionary. Note: Includes shared */ - public long getNobjects(); + long getNobjects(); /** * Returns the number of subjects/objects in the dictionary. */ - public long getNAllObjects(); - public long getNshared(); + long getNAllObjects(); + long getNshared(); - public DictionarySection getSubjects(); + DictionarySection getSubjects(); - public DictionarySection getPredicates(); + DictionarySection getPredicates(); - public DictionarySection getObjects(); + DictionarySection getObjects(); - public TreeMap getAllObjects(); + Map getAllObjects(); - public DictionarySection getShared(); + DictionarySection getShared(); /** * Fills the header with information from the dictionary */ - public void populateHeader(Header header, String rootNode); + void populateHeader(Header header, String rootNode); /** * Returns the type of the dictionary (the way it is written onto file/held in memory) - * @return + * @return type */ - public String getType(); -} \ No newline at end of file + String getType(); +} diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/DictionarySection.java b/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/DictionarySection.java index c6dd05d5..53da285e 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/DictionarySection.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/DictionarySection.java @@ -46,7 +46,7 @@ public interface DictionarySection extends Closeable { * @return * the corresponding ID in the dictionary */ - public long locate(CharSequence s); + long locate(CharSequence s); /** * Find the String associated to a given ID @@ -55,7 +55,7 @@ public interface DictionarySection extends Closeable { * @return * the corresponding string */ - public CharSequence extract(long pos); + CharSequence extract(long pos); /** * Size in bytes of the strings held in the dictionary section. @@ -67,7 +67,7 @@ public interface DictionarySection extends Closeable { * Number of entries in the dictionary section. * @return long */ - public long getNumberOfElements(); + long getNumberOfElements(); /** * Iterator over all entries in the dictionary, sorted lexicographically. 
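The Dictionary/DictionarySection cleanup above is the read-side API that the rest of this series builds on: stringToId/idToString map RDF terms to the integer IDs used by the triple structures and back, and dataTypeOfId now returns a CharSequence rather than a String, which lets the multi-section dictionary hand back its internal byte strings without conversion. A minimal usage sketch, not part of the patch; the file name "dataset.hdt" and the subject IRI are placeholders:

    import org.rdfhdt.hdt.dictionary.Dictionary;
    import org.rdfhdt.hdt.enums.TripleComponentRole;
    import org.rdfhdt.hdt.hdt.HDT;
    import org.rdfhdt.hdt.hdt.HDTManager;

    public class DictionaryLookupSketch {
        public static void main(String[] args) throws Exception {
            // map the HDT file instead of fully loading it into memory
            try (HDT hdt = HDTManager.mapHDT("dataset.hdt", null)) {
                Dictionary dict = hdt.getDictionary();
                // term -> ID; a negative or zero result means the term is absent
                long id = dict.stringToId("http://example.org/s", TripleComponentRole.SUBJECT);
                if (id > 0) {
                    // ID -> term; the result is a CharSequence, not necessarily a String
                    CharSequence term = dict.idToString(id, TripleComponentRole.SUBJECT);
                    System.out.println(id + " -> " + term);
                }
            }
        }
    }

Equality or ordering checks on the returned CharSequence should go through the comparator utilities touched later in this patch (e.g. CharSequenceComparator) rather than String.equals, since the dictionary may return CompactString or ReplazableString instances.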
diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/enums/RDFNotation.java b/hdt-api/src/main/java/org/rdfhdt/hdt/enums/RDFNotation.java index 32606ba5..8e545b1a 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/enums/RDFNotation.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/enums/RDFNotation.java @@ -110,26 +110,34 @@ public static RDFNotation parse(String str) { return NTRIPLES; } str = str.toLowerCase(); - if(str.equals("ntriples")||str.equals("nt")) { - return NTRIPLES; - } else if(str.equals("n3")) { - return N3; - } else if(str.equals("nq")||str.equals("nquad")) { - return NQUAD; - } else if(str.equals("rdfxml")||str.equals("rdf-xml") || str.equals("owl")) { - return RDFXML; - } else if(str.equals("turtle")) { - return TURTLE; - } else if(str.equals("rar")) { - return RAR; - } else if(str.equals("tar")||str.equals("tgz")||str.equals("tbz")||str.equals("tbz2")) { - return TAR; - } else if(str.equals("zip")) { - return ZIP; - } else if(str.equals("list")) { - return LIST; - } else if(str.equals("hdt")) { - return HDT; + switch (str) { + case "ntriples": + case "nt": + return NTRIPLES; + case "n3": + return N3; + case "nq": + case "nquad": + return NQUAD; + case "rdfxml": + case "rdf-xml": + case "owl": + return RDFXML; + case "turtle": + return TURTLE; + case "rar": + return RAR; + case "tar": + case "tgz": + case "tbz": + case "tbz2": + return TAR; + case "zip": + return ZIP; + case "list": + return LIST; + case "hdt": + return HDT; } throw new IllegalArgumentException(); } @@ -147,7 +155,7 @@ public static RDFNotation guess(String fileName) throws IllegalArgumentException int idx = str.lastIndexOf('.'); if(idx!=-1) { - String ext = str.substring(idx+1, str.length()); + String ext = str.substring(idx+1); if(ext.equals("gz") || ext.equals("bz") || ext.equals("bz2")|| ext.equals("xz")) { str = str.substring(0,idx); } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVersion.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVersion.java index 5795c624..607504f7 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVersion.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVersion.java @@ -17,9 +17,9 @@ public class HDTVersion { public static String get_version_string(String delimiter) { return "v" + HDT_VERSION + delimiter + INDEX_VERSION + delimiter + RELEASE_VERSION; - }; + } public static String get_index_suffix(String delimiter) { return ".index.v" + HDT_VERSION + delimiter+INDEX_VERSION; - }; + } } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java index f9ab4761..183f79d7 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java @@ -27,15 +27,208 @@ package org.rdfhdt.hdt.options; +import org.rdfhdt.hdt.rdf.RDFFluxStop; + +import java.util.Objects; +import java.util.function.DoubleSupplier; +import java.util.function.LongSupplier; +import java.util.function.Supplier; + /** + * Options storage, see {@link org.rdfhdt.hdt.options.HDTOptionsKeys} for more information. 
* @author mario.arias - * */ public interface HDTOptions { + + + /** + * clear all the options + */ + void clear(); + + /** + * get an option value + * + * @param key key + * @return value or null if not defined + */ String get(String key); + + /** + * get a value + * + * @param key key + * @param defaultValue default value + * @return value or defaultValue if the value isn't defined + */ + default String get(String key, String defaultValue) { + return Objects.requireNonNullElse(get(key), defaultValue); + } + + /** + * get a value + * + * @param key key + * @param defaultValue default value + * @return value or defaultValue if the value isn't defined + */ + default String get(String key, Supplier defaultValue) { + return Objects.requireNonNullElseGet(get(key), defaultValue); + } + + /** + * get a boolean + * + * @param key key + * @return boolean or false if the value isn't defined + */ + default boolean getBoolean(String key) { + return "true".equalsIgnoreCase(get(key)); + } + + /** + * get a double + * + * @param key key + * @return double or 0 if the value isn't defined + */ + default double getDouble(String key) { + return getDouble(key, 0); + } + + /** + * get a double + * + * @param key key + * @param defaultValue default value + * @return double or defaultValue if the value isn't defined + */ + default double getDouble(String key, DoubleSupplier defaultValue) { + String l = get(key); + if (l == null) { + return defaultValue.getAsDouble(); + } + return Double.parseDouble(l); + } + + /** + * get a double + * + * @param key key + * @param defaultValue default value + * @return double or defaultValue if the value isn't defined + */ + default double getDouble(String key, double defaultValue) { + return getDouble(key, () -> defaultValue); + } + + /** + * get an {@link org.rdfhdt.hdt.rdf.RDFFluxStop} + * + * @param key key + * @return RDFFluxStop or false if the value isn't defined + */ + default RDFFluxStop getFluxStop(String key) { + return RDFFluxStop.readConfig(get(key)); + } + + /** + * get an {@link org.rdfhdt.hdt.rdf.RDFFluxStop} + * + * @param key key + * @param defaultValue default value + * @return RDFFluxStop or defaultValue if the value isn't defined + */ + default RDFFluxStop getFluxStop(String key, Supplier defaultValue) { + return Objects.requireNonNullElseGet(getFluxStop(key), defaultValue); + } + + /** + * get an {@link org.rdfhdt.hdt.rdf.RDFFluxStop} + * + * @param key key + * @param defaultValue default value + * @return RDFFluxStop or defaultValue if the value isn't defined + */ + default RDFFluxStop getFluxStop(String key, RDFFluxStop defaultValue) { + return getFluxStop(key, () -> defaultValue); + } + + /** + * get a long value + * + * @param key key + * @return value or 0 if not defined + */ + long getInt(String key); + + /** + * get a long + * + * @param key key + * @param defaultValue default value + * @return long or defaultValue if the value isn't defined + */ + default long getInt(String key, LongSupplier defaultValue) { + long l = getInt(key); + if (l == 0) { + return defaultValue.getAsLong(); + } + return l; + } + + /** + * get a long + * + * @param key key + * @param defaultValue default value + * @return long or defaultValue if the value isn't defined + */ + default long getInt(String key, long defaultValue) { + return getInt(key, () -> defaultValue); + } + + /** + * set an option value + * + * @param key key + * @param value value + */ void set(String key, String value); - void setOptions(String options); - long getInt(String string); + + /** + * set a 
value, same as using {@link String#valueOf(Object)} with {@link #set(String, String)} + * + * @param key key + * @param value value + */ + default void set(String key, Object value) { + set(key, String.valueOf(value)); + } + + /** + * set a flux stop value, same as using {@link #set(String, String)} with {@link org.rdfhdt.hdt.rdf.RDFFluxStop#asConfig()} + * + * @param key key + * @param fluxStop value + */ + default void set(String key, RDFFluxStop fluxStop) { + set(key, fluxStop.asConfig()); + } + + /** + * set a long value + * + * @param key key + * @param value value + */ void setInt(String key, long value); - void clear(); + + /** + * read an option config, format: (key=value)?(;key=value)* + * + * @param options options + */ + void setOptions(String options); + } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java index af942c07..8ae50d20 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java @@ -1,5 +1,6 @@ package org.rdfhdt.hdt.options; +import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.rdf.RDFFluxStop; /** @@ -23,6 +24,7 @@ public class HDTOptionsKeys { * step, faster but increase the RAM usage. */ public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial"; + /** * Key for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods, * say the number of workers to merge the data. default to the number of processor. long value. @@ -59,6 +61,12 @@ public class HDTOptionsKeys { * Key for the size of the buffers when opening a file */ public static final String LOADER_DISK_BUFFER_SIZE_KEY = "loader.disk.fileBufferSize"; + /** + * Key for {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDTDisk(java.util.Iterator, String, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)}, + * specify that the method doesn't have to copy the triple strings between 2 calls to the iterator, default false + */ + public static final String LOADER_DISK_NO_COPY_ITERATOR_KEY = "loader.disk.noCopyIterator"; + /** * Key for the loading mode of a RDF file for the * {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDT(String, String, org.rdfhdt.hdt.enums.RDFNotation, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)} @@ -82,6 +90,7 @@ public class HDTOptionsKeys { * Value for {@link #LOADER_TYPE_KEY}, read only once the RDF file, default value */ public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass"; + /** * Key for the location of the working directory {@link org.rdfhdt.hdt.hdt.HDTManager} catTree methods, * this directory will be deleted after the HDT generation. by default, the value is random, it is recommended to @@ -103,6 +112,7 @@ public class HDTOptionsKeys { * split size of the RDFFluxStop in the generateHDT method. 
*/ public static final String LOADER_CATTREE_MEMORY_FAULT_FACTOR = "loader.cattree.memoryFaultFactor"; + /** * Key for the hdt supplier type, default to memory */ @@ -131,7 +141,18 @@ public class HDTOptionsKeys { * Value type for the {@link #RDF_FLUX_STOP_KEY}, using {@link RDFFluxStop#asConfig()} would be easier */ public static final String RDF_FLUX_STOP_VALUE_NO_LIMIT = "no_limit"; - + /** + * Value type for the {@link #RDF_FLUX_STOP_KEY}, using {@link RDFFluxStop#asConfig()} would be easier + */ + public static final char RDF_FLUX_STOP_VALUE_OP_AND = '&'; + /** + * Value type for the {@link #RDF_FLUX_STOP_KEY}, using {@link RDFFluxStop#asConfig()} would be easier + */ + public static final char RDF_FLUX_STOP_VALUE_OP_OR = '|'; + /** + * Value type for the {@link #RDF_FLUX_STOP_KEY}, using {@link RDFFluxStop#asConfig()} would be easier + */ + public static final char RDF_FLUX_STOP_VALUE_OP_NOT = '!'; /** * Key for enabling the profiler (if implemented), default to false. Boolean value @@ -151,6 +172,56 @@ public class HDTOptionsKeys { */ public static final String TRIPLE_ORDER_KEY = "triplesOrder"; + /** + * Option to set how the HDTs are loaded in HDTCat/HDTDiff, default {@link #LOAD_HDT_TYPE_VALUE_MAP} + */ + public static final String LOAD_HDT_TYPE_KEY = "loader.hdt.type"; + /** + * load the HDT file into memory + */ + public static final String LOAD_HDT_TYPE_VALUE_LOAD = "load"; + /** + * map the HDT file, default value + */ + public static final String LOAD_HDT_TYPE_VALUE_MAP = "map"; + + /** + * Implementation of the temporary dictionary + */ + public static final String TEMP_DICTIONARY_IMPL_KEY = "tempDictionary.impl"; + /** + * use Hash map to create the HDT + */ + public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH = "hash"; + /** + * use Hash map to create the HDT and store the multisection dictionary, mandatory to create MSC + */ + public static final String TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH = "multHash"; + /** + * use Hash map with Prefix AND Suffix front-coded (PSFC), mandatory to create PSFC dictionary + */ + public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH_PSFC = "hashPsfc"; + + /** + * Implementation of the dictionary + */ + public static final String DICTIONARY_TYPE_KEY = "dictionary.type"; + /** + * 4 Section dictionary + */ + public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION; + /** + * Prefix AND Suffix front-coded (PSFC) 4 Section dictionary + */ + public static final String DICTIONARY_TYPE_VALUE_FOUR_PSFC_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION; + /** + * big 4 Section dictionary + */ + public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG ="dictionaryFourBig"; + /** + * multi section dictionary + */ + public static final String DICTIONARY_TYPE_VALUE_MULTI_OBJECTS = "dictionaryMultiObj"; private HDTOptionsKeys() {} } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java index db61b8e8..6dae749d 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java @@ -17,354 +17,384 @@ * @author Antoine Willerval */ public abstract class RDFFluxStop { - private static final Map> BUILDER = new HashMap<>(); - private static final Map> BUILDER_OP = new HashMap<>(); - - static { - registerCustomRDFFluxStopConfig(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT, RDFFluxStop::countLimit); - 
registerCustomRDFFluxStopConfig(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE, RDFFluxStop::sizeLimit); - registerCustomRDFFluxStopConfig(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_NO_LIMIT, l -> noLimit()); - - registerCustomRDFFluxStopOperator('&', RDFFluxStop::and); - registerCustomRDFFluxStopOperator('|', RDFFluxStop::or); - } - - /** - * register a custom flux stop option for the {@link #readConfig(String)} method - * - * @param name name of the option - * @param builder builder - */ - public static void registerCustomRDFFluxStopConfig(String name, LongFunction builder) { - name.chars().forEach(c -> { - if (!Character.isJavaIdentifierPart(c)) { - throw new IllegalArgumentException("Config can't contain non identifier part! Found '" + c + "'"); - } - }); - BUILDER.put(name, builder); - } - - /** - * register a custom flux stop operator for the {@link #readConfig(String)} method - * - * @param operator operator character - * @param builder builder - */ - public static void registerCustomRDFFluxStopOperator(char operator, BiFunction builder) { - if (Character.isJavaIdentifierPart(operator) || operator == '(' || operator == ')') { - throw new IllegalArgumentException("Operator can't be an identifier part or a parenthesis! Found '" + operator + "'"); - } - BUILDER_OP.put(operator, builder); - } - - private static int searchNextParenthesis(String cfg, int start) { - int deep = 0; - for (int i = start; i < cfg.length(); i++) { - switch (cfg.charAt(i)) { - case '(': - deep++; - break; - case ')': - if (deep == 0) { - return i; - } - deep--; - } - } - - throw new IllegalArgumentException("Can't find next parenthesis for start " + start); - } - - /** - * read a config to a flux stop, grammar: - * - *
<pre>FluxStop: limiter:number | ( FluxStop ) | Operator | (empty)</pre>
- *
- * <pre>Operator: ( FluxStop ) op ( FluxStop )</pre>
- *
- * <pre>You can register limiter with the {@link #registerCustomRDFFluxStopConfig(String, LongFunction)} method</pre>
- *
- * <pre>You can register op with the {@link #registerCustomRDFFluxStopOperator(char, BiFunction)} method</pre>
- * - * @param cfg config string - * @param start start in the config string - * @param end end in the config string - * @return RDFFluxStop or null if no RDFFluxStop is present - * @see #readConfig(String) - */ - public static RDFFluxStop readConfig(String cfg, int start, int end) { - if (cfg == null) { - return null; - } - int i = start; - // current element for boolean operators - RDFFluxStop element = null; - while (i < end) { - char c = cfg.charAt(i++); - - if (c == '(') { // start of block - if (element != null) { - throw new IllegalArgumentException("Find an element after another one without having an operator! " + (i - 1)); - } - int next = searchNextParenthesis(cfg, i); - element = readConfig(cfg, i, next); - i = next + 1; - - } else if (c == ')') { // end of block, should be handled here - throw new IllegalArgumentException("Find closing parenthesis without opening! " + (i - 1)); - } else if (Character.isJavaIdentifierPart(c)) { // start of function - - // read key - int startElement = i - 1; - int j = i; - while (j < end) { - if (!Character.isJavaIdentifierPart(cfg.charAt(j))) { - break; - } - j++; - } - - if (j == end || cfg.charAt(j) != ':') { // no value for key - throw new IllegalArgumentException("Identifier without value: " + startElement); - } - - String key = cfg.substring(startElement, j); - - LongFunction builder = BUILDER.get(key); - - if (builder == null) { // key isn't a right config - throw new IllegalArgumentException("Can't find option: " + key); - } - - // read value - - startElement = j + 1; - if (startElement == end || !Character.isDigit(cfg.charAt(startElement))) { // not a number value - throw new IllegalArgumentException("Identifier without number value: " + key + ", " + startElement); - } - - j = startElement; - while (j < end) { - if (!Character.isDigit(cfg.charAt(j))) { - break; - } - j++; - } - long value = Long.parseLong(cfg.substring(startElement, j)); - - element = builder.apply(value); - i = j; - } else { - // read operator or throw error - BiFunction opFunc = BUILDER_OP.get(c); - - if (opFunc == null) { - throw new IllegalArgumentException("Unknow component: " + c + ", " + (i - 1)); - } - - if (element == null) { - throw new IllegalArgumentException("Find operator without element before! " + (i - 1)); - } - return opFunc.apply(element, readConfig(cfg, i, end)); - } - } - - return element; - } - - /** - * read a config to a flux stop, see {@link #readConfig(String, int, int)} for grammar - * - * @param cfg config string - * @return RDFFluxStop or null if no RDFFluxStop is present - * @see #readConfig(String, int, int) - */ - public static RDFFluxStop readConfig(String cfg) { - return cfg == null ? 
null : readConfig(cfg, 0, cfg.length()); - } - - /** - * @return basic implementation without any limit - */ - public static RDFFluxStop noLimit() { - return new RDFFluxStop() { - @Override - public boolean canHandle(TripleString ts) { - return true; - } - - @Override - public void restart() { - // nothing - } - - @Override - public String asConfig() { - return HDTOptionsKeys.RDF_FLUX_STOP_VALUE_NO_LIMIT + ":0"; - } - }; - } - - /** - * implementation of flux stop stopping after a maximum triple count - * - * @param maxTriple maximum count - * @return FluxStop - */ - public static RDFFluxStop countLimit(long maxTriple) { - if (maxTriple <= 0) { - throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!"); - } - return new RDFFluxStop() { - long current = 0; - - @Override - public boolean canHandle(TripleString ts) { - return current++ < maxTriple; - } - - @Override - public void restart() { - current = 0; - } - - @Override - public String asConfig() { - return HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":" + maxTriple; - } - }; - } - - /** - * implementation of flux stop stopping after a maximum NTriple size - * - * @param maxSize maximum size - * @return FluxStop - */ - public static RDFFluxStop sizeLimit(long maxSize) { - if (maxSize <= 0) { - throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!"); - } - return new RDFFluxStop() { - long size = 0; - - @Override - public boolean canHandle(TripleString ts) { - long tsSize; - try { - tsSize = ts.asNtriple().toString().getBytes(StandardCharsets.UTF_8).length; - } catch (IOException e) { - throw new RuntimeException("Can't estimate the size of the triple " + ts, e); - } - try { - return size < maxSize; - } finally { - size += tsSize; - } - } - - @Override - public void restart() { - size = 0; - } - - @Override - public String asConfig() { - return HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":" + maxSize; - } - }; - } - - /** - * should we stop the flux after this triple or not? 
- * - * @param ts the triple - * @return true if the flux can handle this triple, false otherwise - */ - public abstract boolean canHandle(TripleString ts); - - /** - * restart the flux stop - */ - public abstract void restart(); - - /** - * @return config value for the {@link org.rdfhdt.hdt.options.HDTOptionsKeys#RDF_FLUX_STOP_KEY} option - */ - public abstract String asConfig(); - - @Override - public String toString() { - return asConfig(); - } - - @Override - public boolean equals(Object obj) { - if (obj == this) { - return true; - } - if (!(obj instanceof RDFFluxStop)) { - return false; - } - RDFFluxStop fluxStop = (RDFFluxStop) obj; - - return asConfig().equals(fluxStop.asConfig()); - } - - /** - * combine 2 rdf flux stop with a boolean operation, return this if fluxStop == null - * - * @param fluxStop the other flux stop - * @param stringOperator operator for the {@link #asConfig()} version - * @param operator the operator - * @return rdffluxstop - * @see #and(RDFFluxStop) - * @see #or(RDFFluxStop) - */ - public RDFFluxStop booleanOp(RDFFluxStop fluxStop, String stringOperator, BinaryOperator operator) { - if (fluxStop == null) { - return this; - } - return new RDFFluxStop() { - @Override - public boolean canHandle(TripleString ts) { - boolean left = RDFFluxStop.this.canHandle(ts); - boolean right = fluxStop.canHandle(ts); - return operator.apply(left, right); - } - - @Override - public void restart() { - RDFFluxStop.this.restart(); - fluxStop.restart(); - } - - @Override - public String asConfig() { - String left = RDFFluxStop.this.asConfig(); - String right = fluxStop.asConfig(); - return "(" + left + ")" + stringOperator + "(" + right + ")"; - } - }; - } - - /** - * {@link #booleanOp(RDFFluxStop, String, BinaryOperator)} version for AND - * - * @param fluxStop other flux stop - * @return rdffluxstop - */ - public RDFFluxStop and(RDFFluxStop fluxStop) { - return booleanOp(fluxStop, "&", (a, b) -> a && b); - } - - /** - * {@link #booleanOp(RDFFluxStop, String, BinaryOperator)} version for OR - * - * @param fluxStop other flux stop - * @return rdffluxstop - */ - public RDFFluxStop or(RDFFluxStop fluxStop) { - return booleanOp(fluxStop, "|", (a, b) -> a || b); - } + private static final Map> BUILDER = new HashMap<>(); + private static final Map> BUILDER_OP = new HashMap<>(); + + private static final RDFFluxStop EMPTY = new RDFFluxStop() { + @Override + public boolean canHandle(TripleString ts) { + return false; + } + + @Override + public void restart() { + } + + @Override + public String asConfig() { + return ""; + } + }; + + private static final RDFFluxStop NO_LIMIT = new RDFFluxStop() { + @Override + public boolean canHandle(TripleString ts) { + return true; + } + + @Override + public void restart() { + // nothing + } + + @Override + public String asConfig() { + return HDTOptionsKeys.RDF_FLUX_STOP_VALUE_NO_LIMIT + ":0"; + } + }; + + static { + // default rdf flux stop config + registerCustomRDFFluxStopConfig(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT, RDFFluxStop::countLimit); + registerCustomRDFFluxStopConfig(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE, RDFFluxStop::sizeLimit); + registerCustomRDFFluxStopConfig(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_NO_LIMIT, l -> noLimit()); + + // default rdf flux stop boolean operations + registerCustomRDFFluxStopOperator(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_OP_AND, RDFFluxStop::and); + registerCustomRDFFluxStopOperator(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_OP_OR, RDFFluxStop::or); + registerCustomRDFFluxStopOperator(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_OP_NOT, 
(a, b) -> b.not()); + } + + /** + * register a custom flux stop option for the {@link #readConfig(String)} method + * + * @param name name of the option + * @param builder builder + */ + public static void registerCustomRDFFluxStopConfig(String name, LongFunction builder) { + name.chars().forEach(c -> { + if (!Character.isJavaIdentifierPart(c)) { + throw new IllegalArgumentException("Config can't contain non identifier part! Found '" + c + "'"); + } + }); + BUILDER.put(name, builder); + } + + /** + * register a custom flux stop operator for the {@link #readConfig(String)} method + * + * @param operator operator character + * @param builder builder + */ + public static void registerCustomRDFFluxStopOperator(char operator, BiFunction builder) { + if (Character.isJavaIdentifierPart(operator) || operator == '(' || operator == ')') { + throw new IllegalArgumentException("Operator can't be an identifier part or a parenthesis! Found '" + operator + "'"); + } + BUILDER_OP.put(operator, builder); + } + + private static int searchNextParenthesis(String cfg, int start) { + int deep = 0; + for (int i = start; i < cfg.length(); i++) { + switch (cfg.charAt(i)) { + case '(': + deep++; + break; + case ')': + if (deep == 0) { + return i; + } + deep--; + } + } + + throw new IllegalArgumentException("Can't find next parenthesis for start " + start); + } + + /** + * read a config to a flux stop, grammar: + * + *
<pre>FluxStop: limiter:number | ( FluxStop ) | Operator | (empty)</pre>
+ *
+ * <pre>Operator: ( FluxStop ) op ( FluxStop )</pre>
+ *
+ * <pre>You can register limiter with the {@link #registerCustomRDFFluxStopConfig(String, LongFunction)} method</pre>
+ *
+ * <pre>You can register op with the {@link #registerCustomRDFFluxStopOperator(char, BiFunction)} method</pre>
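+ *
+ * <p>Illustrative example (not part of the original javadoc): {@code readConfig("(count:4000)|(size:1000)")}
+ * builds the same flux stop as {@code RDFFluxStop.countLimit(4000).or(RDFFluxStop.sizeLimit(1000))}, i.e.
+ * a chunk ends once it holds 4000 triples or once its serialized NTriples size reaches 1000 bytes.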
+ * + * @param cfg config string + * @param start start in the config string + * @param end end in the config string + * @return RDFFluxStop or null if no RDFFluxStop is present + * @see #readConfig(String) + */ + public static RDFFluxStop readConfig(String cfg, int start, int end) { + if (cfg == null) { + return null; + } + int i = start; + // current element for boolean operators + RDFFluxStop element = null; + while (i < end) { + char c = cfg.charAt(i++); + + if (c == '(') { // start of block + if (element != null) { + throw new IllegalArgumentException("Find an element after another one without having an operator! " + (i - 1)); + } + int next = searchNextParenthesis(cfg, i); + element = readConfig(cfg, i, next); + i = next + 1; + + } else if (c == ')') { // end of block, should be handled here + throw new IllegalArgumentException("Find closing parenthesis without opening! " + (i - 1)); + } else if (Character.isJavaIdentifierPart(c)) { // start of function + + // read key + int startElement = i - 1; + int j = i; + while (j < end) { + if (!Character.isJavaIdentifierPart(cfg.charAt(j))) { + break; + } + j++; + } + + if (j == end || cfg.charAt(j) != ':') { // no value for key + throw new IllegalArgumentException("Identifier without value: " + startElement); + } + + String key = cfg.substring(startElement, j); + + LongFunction builder = BUILDER.get(key); + + if (builder == null) { // key isn't a right config + throw new IllegalArgumentException("Can't find option: " + key); + } + + // read value + + startElement = j + 1; + if (startElement == end || !Character.isDigit(cfg.charAt(startElement))) { // not a number value + throw new IllegalArgumentException("Identifier without number value: " + key + ", " + startElement); + } + + j = startElement; + while (j < end) { + if (!Character.isDigit(cfg.charAt(j))) { + break; + } + j++; + } + long value = Long.parseLong(cfg.substring(startElement, j)); + + element = builder.apply(value); + i = j; + } else { + // read operator or throw error + BiFunction opFunc = BUILDER_OP.get(c); + + if (opFunc == null) { + throw new IllegalArgumentException("Unknow component: " + c + ", " + (i - 1)); + } + + if (element == null) { + element = EMPTY; + } + return opFunc.apply(element, readConfig(cfg, i, end)); + } + } + + return element; + } + + /** + * read a config to a flux stop, see {@link #readConfig(String, int, int)} for grammar + * + * @param cfg config string + * @return RDFFluxStop or null if no RDFFluxStop is present + * @see #readConfig(String, int, int) + */ + public static RDFFluxStop readConfig(String cfg) { + return cfg == null ? 
null : readConfig(cfg, 0, cfg.length()); + } + + /** + * @return basic implementation without any limit + */ + public static RDFFluxStop noLimit() { + return NO_LIMIT; + } + + /** + * implementation of flux stop stopping after a maximum triple count + * + * @param maxTriple maximum count + * @return FluxStop + */ + public static RDFFluxStop countLimit(long maxTriple) { + if (maxTriple <= 0) { + throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!"); + } + return new RDFFluxStop() { + long current = 0; + + @Override + public boolean canHandle(TripleString ts) { + return current++ < maxTriple; + } + + @Override + public void restart() { + current = 0; + } + + @Override + public String asConfig() { + return HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":" + maxTriple; + } + }; + } + + /** + * implementation of flux stop stopping after a maximum NTriple size + * + * @param maxSize maximum size + * @return FluxStop + */ + public static RDFFluxStop sizeLimit(long maxSize) { + if (maxSize <= 0) { + throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!"); + } + return new RDFFluxStop() { + long size = 0; + + @Override + public boolean canHandle(TripleString ts) { + long tsSize; + try { + tsSize = ts.asNtriple().toString().getBytes(StandardCharsets.UTF_8).length; + } catch (IOException e) { + throw new RuntimeException("Can't estimate the size of the triple " + ts, e); + } + try { + return size < maxSize; + } finally { + size += tsSize; + } + } + + @Override + public void restart() { + size = 0; + } + + @Override + public String asConfig() { + return HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":" + maxSize; + } + }; + } + + /** + * should we stop the flux after this triple or not? + * + * @param ts the triple + * @return true if the flux can handle this triple, false otherwise + */ + public abstract boolean canHandle(TripleString ts); + + /** + * restart the flux stop + */ + public abstract void restart(); + + /** + * @return config value for the {@link org.rdfhdt.hdt.options.HDTOptionsKeys#RDF_FLUX_STOP_KEY} option + */ + public abstract String asConfig(); + + @Override + public String toString() { + return asConfig(); + } + + @Override + public boolean equals(Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof RDFFluxStop)) { + return false; + } + RDFFluxStop fluxStop = (RDFFluxStop) obj; + + return asConfig().equals(fluxStop.asConfig()); + } + + /** + * combine 2 rdf flux stop with a boolean operation + * + * @param fluxStop the other flux stop + * @param charOperator operator for the {@link #asConfig()} version + * @param operator the operator + * @return rdffluxstop + * @see #and(RDFFluxStop) + * @see #or(RDFFluxStop) + */ + public RDFFluxStop booleanOp(RDFFluxStop fluxStop, char charOperator, BinaryOperator operator) { + if (fluxStop == null) { + return this; + } + return new RDFFluxStop() { + @Override + public boolean canHandle(TripleString ts) { + boolean left = RDFFluxStop.this.canHandle(ts); + boolean right = fluxStop.canHandle(ts); + return operator.apply(left, right); + } + + @Override + public void restart() { + RDFFluxStop.this.restart(); + fluxStop.restart(); + } + + @Override + public String asConfig() { + String left = RDFFluxStop.this.asConfig(); + String right = fluxStop.asConfig(); + return "(" + left + ")" + charOperator + "(" + right + ")"; + } + }; + } + + /** + * {@link #booleanOp(RDFFluxStop, char, BinaryOperator)} version for AND + * + * @param fluxStop other flux stop + * @return rdffluxstop + 
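+ * <p>Illustrative: {@code countLimit(1000).and(sizeLimit(10000))} accepts a triple only while both limits
+ * hold; its {@code asConfig()} form is {@code "(count:1000)&(size:10000)"}.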
*/ + public RDFFluxStop and(RDFFluxStop fluxStop) { + return booleanOp(fluxStop, HDTOptionsKeys.RDF_FLUX_STOP_VALUE_OP_AND, (a, b) -> a && b); + } + + /** + * {@link #booleanOp(RDFFluxStop, char, BinaryOperator)} version for OR + * + * @param fluxStop other flux stop + * @return rdffluxstop + */ + public RDFFluxStop or(RDFFluxStop fluxStop) { + return booleanOp(fluxStop, HDTOptionsKeys.RDF_FLUX_STOP_VALUE_OP_OR, (a, b) -> a || b); + } + + /** + * {@link #booleanOp(RDFFluxStop, char, BinaryOperator)} version for NOT + * + * @return rdffluxstop + */ + public RDFFluxStop not() { + return EMPTY.booleanOp(this, HDTOptionsKeys.RDF_FLUX_STOP_VALUE_OP_NOT, (a, b) -> !b); + } } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/TripleWriter.java b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/TripleWriter.java index ab93ad07..0a60bd07 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/TripleWriter.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/TripleWriter.java @@ -5,5 +5,5 @@ import org.rdfhdt.hdt.triples.TripleString; public interface TripleWriter extends AutoCloseable { - public void addTriple(TripleString str) throws IOException; + void addTriple(TripleString str) throws IOException; } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleID.java b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleID.java index 0feb8d0d..f21b9097 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleID.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleID.java @@ -156,7 +156,7 @@ public void clear() { */ @Override public String toString() { - return Long.toString(subject) + " " + predicate + " " + object; + return subject + " " + predicate + " " + object; } /** @@ -196,9 +196,7 @@ public boolean match(TripleID pattern) { /* Remember that 0 acts as a wildcard */ if (subjectPattern == 0 || this.subject == subjectPattern) { if (predicatePattern == 0 || this.predicate == predicatePattern) { - if (objectPattern == 0 || this.object == objectPattern) { - return true; - } + return objectPattern == 0 || this.object == objectPattern; } } return false; diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java index d581c8c6..c8317eb9 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java @@ -167,9 +167,7 @@ public boolean equals(Object other) { public boolean match(TripleString pattern) { if (pattern.getSubject().length() == 0 || equalsCharSequence(pattern.getSubject(), this.subject)) { if (pattern.getPredicate().length() == 0 || equalsCharSequence(pattern.getPredicate(), this.predicate)) { - if (pattern.getObject().length() == 0 || equalsCharSequence(pattern.getObject(), this.object)) { - return true; - } + return pattern.getObject().length() == 0 || equalsCharSequence(pattern.getObject(), this.object); } } return false; diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDT2RDF.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDT2RDF.java index 632c3f2a..3f452fa8 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDT2RDF.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDT2RDF.java @@ -27,6 +27,7 @@ package org.rdfhdt.hdt.tools; import java.io.PrintStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -58,11 +59,11 @@ public class HDT2RDF implements ProgressListener { public void execute() throws Exception { - PrintStream out = 
null; + PrintStream out; if (rdfOutput.equals("stdout")){ out = System.out; } else { - out = new PrintStream(rdfOutput, "UTF-8"); + out = new PrintStream(rdfOutput, StandardCharsets.UTF_8); } HDT hdt=HDTManager.mapHDT(hdtInput, this); @@ -91,6 +92,7 @@ public void notifyProgress(float level, String message) { //System.out.println(message + "\t"+ Float.toString(level)); } + @SuppressWarnings("deprecation") public static void main(String[] args) throws Throwable { HDT2RDF hdt2rdf = new HDT2RDF(); JCommander com = new JCommander(hdt2rdf, args); diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTCat.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTCat.java index cba34223..23184470 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTCat.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTCat.java @@ -23,6 +23,7 @@ import com.beust.jcommander.Parameter; import com.beust.jcommander.internal.Lists; +import org.apache.commons.io.FileUtils; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; @@ -65,7 +66,7 @@ public class HDTCat implements ProgressListener { @Parameter(names = "-quiet", description = "Do not show progress of the conversion") public boolean quiet; - public void execute() throws ParserException, IOException { + public void execute() throws IOException { HDTSpecification spec; if(configFile!=null) { @@ -79,12 +80,10 @@ public void execute() throws ParserException, IOException { File file = new File(hdtOutput); File theDir = new File(file.getAbsolutePath()+"_tmp"); - theDir.mkdirs(); + Files.createDirectories(theDir.toPath()); String location = theDir.getAbsolutePath()+"/"; - HDT hdt = HDTManager.catHDT(location,hdtInput1, hdtInput2 , spec,this); - - try { + try (HDT hdt = HDTManager.catHDT(location,hdtInput1, hdtInput2 , spec,this)) { // Show Basic stats if(!quiet){ System.out.println("Total Triples: "+hdt.getTriples().getNumberOfElements()); @@ -100,17 +99,15 @@ public void execute() throws ParserException, IOException { System.out.println("HDT saved to file in: "+sw.stopAndShow()); Files.delete(Paths.get(location+"dictionary")); Files.delete(Paths.get(location+"triples")); - theDir.delete(); + FileUtils.deleteDirectory(theDir); // Generate index and dump it to .hdt.index file sw.reset(); if(generateIndex) { - hdt = HDTManager.indexedHDT(hdt,this); + HDTManager.indexedHDT(hdt,this); System.out.println("Index generated and saved in: "+sw.stopAndShow()); } - } finally { - if(hdt!=null) hdt.close(); } // Debug all inserted triples @@ -123,10 +120,11 @@ public void execute() throws ParserException, IOException { @Override public void notifyProgress(float level, String message) { if(!quiet) { - System.out.print("\r"+message + "\t"+ Float.toString(level)+" \r"); + System.out.print("\r"+message + "\t"+ level +" \r"); } } + @SuppressWarnings("deprecation") public static void main(String[] args) throws Throwable { HDTCat hdtCat = new HDTCat(); System.out.println("Welcome to hdtCat!"); @@ -151,4 +149,4 @@ public static void main(String[] args) throws Throwable { hdtCat.execute(); } -} \ No newline at end of file +} diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java index 379474b9..a527dc52 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java @@ -269,10 +269,11 @@ public void execute() throws ParserException, 
IOException { @Override public void notifyProgress(float level, String message) { if(!quiet) { - System.out.print("\r"+message + "\t"+ Float.toString(level)+" \r"); + System.out.print("\r"+message + "\t"+ level +" \r"); } } - + + @SuppressWarnings("deprecation") public static void main(String[] args) throws Throwable { RDF2HDT rdf2hdt = new RDF2HDT(); JCommander com = new JCommander(rdf2hdt, args); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCache.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCache.java index d31f8cb2..52714dee 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCache.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCache.java @@ -32,8 +32,8 @@ * */ public interface DictionaryCache { - public T get(long id); - public void put(long id, T node); - public int size(); - public void clear(); + T get(long id); + void put(long id, T node); + int size(); + void clear(); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheArray.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheArray.java index ec9213d5..a2ebb62b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheArray.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheArray.java @@ -37,13 +37,12 @@ */ public class DictionaryCacheArray implements DictionaryCache { - private Object array[]; + private Object[] array; final int capacity; - int numentries=0; + int numentries; public DictionaryCacheArray(int capacity) { array = null; - numentries=0; this.capacity=capacity; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheArrayWeak.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheArrayWeak.java index e0f6604d..434ceb59 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheArrayWeak.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheArrayWeak.java @@ -35,7 +35,7 @@ */ public class DictionaryCacheArrayWeak implements DictionaryCache { - private Reference array[]; + private final Reference[] array; @SuppressWarnings("unchecked") public DictionaryCacheArrayWeak(int capacity) { @@ -57,7 +57,7 @@ public T get(long id) { @Override public void put(long id, T node) { - array[(int) (id-1)] = new WeakReference(node); + array[(int) (id-1)] = new WeakReference<>(node); } @Override diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheHash.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheHash.java index b5e75820..d3a984a4 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheHash.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheHash.java @@ -35,7 +35,7 @@ */ public class DictionaryCacheHash implements DictionaryCache { - private Map hash = new ConcurrentHashMap<>(); + private final Map hash = new ConcurrentHashMap<>(); @Override public T get(long id) { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheLRI.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheLRI.java index 1ece606e..94eeaae5 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheLRI.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheLRI.java @@ -39,15 +39,15 @@ */ public class DictionaryCacheLRI implements DictionaryCache { - private Map cache; - private long [] arr; + private final Map cache; + private final long [] arr; 
private int ptr=0; private final int size; public DictionaryCacheLRI(int size) { this.size = size; arr = new long[size]; - cache = new ConcurrentHashMap(size); + cache = new ConcurrentHashMap<>(size); } /* (non-Javadoc) diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheLRU.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheLRU.java index c78f1fe1..44bc4891 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheLRU.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/cache/DictionaryCacheLRU.java @@ -34,7 +34,7 @@ */ public class DictionaryCacheLRU implements DictionaryCache { - private LRUCache lru; + private final LRUCache lru; public DictionaryCacheLRU(int size) { lru = new LRUCache<>(size); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AdjacencyList.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AdjacencyList.java index a2ec0be3..a469d5b2 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AdjacencyList.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AdjacencyList.java @@ -39,8 +39,8 @@ public class AdjacencyList { private final Bitmap bitmap; /** - * @param array - * @param bitmap + * @param array array + * @param bitmap bitmap */ public AdjacencyList(Sequence array, Bitmap bitmap) { super(); @@ -232,7 +232,7 @@ public void dump() { System.out.println(","); } } - System.out.println(""); + System.out.println(); for (long i = 0; i < countListsX() && i < 100; i++) { System.out.print("List " + i + " ["); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/Bitmap375.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/Bitmap375.java index ef842b14..bdfe9e81 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/Bitmap375.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/Bitmap375.java @@ -436,7 +436,7 @@ public void load(InputStream input, ProgressListener listener) throws IOExceptio x: desired key n: size of the array */ - public int binarySearch(int arr[], long x, int n) { + public int binarySearch(int[] arr, long x, int n) { int i, j, m; i = 0; j = n; @@ -454,7 +454,7 @@ public int binarySearch(int arr[], long x, int n) { return i; } - public int binarySearch(long arr[], long x, int n) { + public int binarySearch(long[] arr, long x, int n) { int i, j, m; i = 0; j = n; @@ -471,4 +471,4 @@ public int binarySearch(long arr[], long x, int n) { return i; } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/Bitmap64Disk.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/Bitmap64Disk.java index 6c1ccef2..e53eaaeb 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/Bitmap64Disk.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/Bitmap64Disk.java @@ -129,7 +129,7 @@ public long selectNext1(long fromIndex) { while (true) { if (word != 0) - return ((long)wordIndex * W) + Long.numberOfTrailingZeros(word); + return (wordIndex * W) + Long.numberOfTrailingZeros(word); if (++wordIndex == words.length()) return -1; word = words.get(wordIndex); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java index dd9e90d4..dd5d5b7e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java @@ -71,7 
+71,7 @@ public static long decode(InputStream in) throws IOException { while( (readbyte & 0x80)==0) { if(shift>=50) { // We read more bytes than required to load the max long - throw new IllegalArgumentException(); + throw new IllegalArgumentException("Read more bytes than required to load the max long"); } out |= (readbyte & 127) << shift; @@ -92,7 +92,7 @@ public static long decode(ByteBuffer in) throws IOException { while( (readbyte & 0x80)==0) { if(shift>=50) { // We read more bytes than required to load the max long - throw new IllegalArgumentException(); + throw new IllegalArgumentException("Read more bytes than required to load the max long"); } out |= (readbyte & 127L) << shift; @@ -114,7 +114,7 @@ public static long decode(BigMappedByteBuffer in) throws IOException { while( (readbyte & 0x80)==0) { if(shift>=50) { // We read more bytes than required to load the max long - throw new IllegalArgumentException(); + throw new IllegalArgumentException("Read more bytes than required to load the max long"); } out |= (readbyte & 127L) << shift; @@ -149,6 +149,7 @@ public static int decode(byte[] data, int offset, Mutable value) { int i=0; int shift=0; while( (0x80 & data[offset+i])==0) { + assert shift < 50 : "Read more bytes than required to load the max long"; out |= (data[offset+i] & 127L) << shift; i++; shift+=7; @@ -164,6 +165,7 @@ public static int decode(BigByteBuffer data, long offset, Mutable value) { int i = 0; int shift=0; while( (0x80 & data.get(offset+i))==0) { + assert shift < 50 : "Read more bytes than required to load the max long"; out |= (data.get(offset+i) & 127L) << shift; i++; shift+=7; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64Map.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64Map.java index 083a3dc0..b98c0b46 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64Map.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64Map.java @@ -65,10 +65,10 @@ public class SequenceLog64Map implements Sequence,Closeable { private static final long LONGS_PER_BUFFER=128*1024*1024; // 128*8 = 1Gb per chunk. 
private CloseMappedByteBuffer[] buffers; private FileChannel ch; - private int numbits; - private long numentries; + private final int numbits; + private final long numentries; private long lastword; - private long numwords; + private final long numwords; public SequenceLog64Map(File f) throws IOException { // Read from the beginning of the file diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryDiff.java index 202d683a..c6a7710c 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryDiff.java @@ -17,7 +17,7 @@ public interface DictionaryDiff extends Closeable { * @param listener listener to get the progress * @throws IOException io error */ - void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) throws IOException; + void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) throws IOException; /** * @return the CatMapping of the diff @@ -32,5 +32,5 @@ public interface DictionaryDiff extends Closeable { /** * @return the cat mapping for each section */ - HashMap getAllMappings(); + Map getAllMappings(); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java index 80892f46..81296c29 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java @@ -32,20 +32,45 @@ import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; +import java.nio.file.Path; +import java.util.Objects; + /** * Factory that creates Dictionary objects - * */ public class DictionaryFactory { - public static final String MOD_DICT_IMPL_HASH = "hash"; - public static final String MOD_DICT_IMPL_MULT_HASH = "multHash"; - public static final String MOD_DICT_IMPL_HASH_PSFC = "hashPsfc"; - public static final String DICTIONARY_TYPE_FOUR_SECTION_BIG ="dictionaryFourBig"; - public static final String DICTIONARY_TYPE_MULTI_OBJECTS = "dictionaryMultiObj"; - private DictionaryFactory() {} + /** + * @deprecated use {@link org.rdfhdt.hdt.options.HDTOptionsKeys#TEMP_DICTIONARY_IMPL_VALUE_HASH} instead + */ + @Deprecated + public static final String MOD_DICT_IMPL_HASH = HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_HASH; + /** + * @deprecated use {@link org.rdfhdt.hdt.options.HDTOptionsKeys#TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH} instead + */ + @Deprecated + public static final String MOD_DICT_IMPL_MULT_HASH = HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH; + /** + * @deprecated use {@link org.rdfhdt.hdt.options.HDTOptionsKeys#TEMP_DICTIONARY_IMPL_VALUE_HASH_PSFC} instead + */ + @Deprecated + public static final String MOD_DICT_IMPL_HASH_PSFC = HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_HASH_PSFC; + /** + * @deprecated use {@link org.rdfhdt.hdt.options.HDTOptionsKeys#DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG} instead + */ + @Deprecated + public static final String DICTIONARY_TYPE_FOUR_SECTION_BIG = HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG; + /** + * @deprecated use {@link org.rdfhdt.hdt.options.HDTOptionsKeys#DICTIONARY_TYPE_VALUE_MULTI_OBJECTS} instead + */ + @Deprecated + public static final String 
DICTIONARY_TYPE_MULTI_OBJECTS = HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS; + + private DictionaryFactory() { + } /** * Creates a default dictionary (HashDictionary) @@ -58,65 +83,110 @@ public static Dictionary createDefaultDictionary() } /** - * Creates a default dictionary (HashDictionary) + * Creates a temp dictionary (allow insert) * - * @return Dictionary + * @param spec specs to read dictionary + * @return TempDictionary */ public static TempDictionary createTempDictionary(HDTOptions spec) { - String name = spec.get("tempDictionary.impl"); + String name = Objects.requireNonNullElse(spec.get(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY), ""); // Implementations available in the Core - if(name==null || "".equals(name) || MOD_DICT_IMPL_HASH.equals(name)) { - return new HashDictionary(spec,false); - } else if(MOD_DICT_IMPL_HASH_PSFC.equals(name)){ - return new PSFCTempDictionary(new HashDictionary(spec,false)); - } else if(MOD_DICT_IMPL_MULT_HASH.equals(name)){ - return new HashDictionary(spec,true); + switch (name) { + case "": + case HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_HASH: + return new HashDictionary(spec, false); + case HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_HASH_PSFC: + return new PSFCTempDictionary(new HashDictionary(spec, false)); + case HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH: + return new HashDictionary(spec, true); + default: + throw new IllegalFormatException("Implementation of triples not found for " + name); } - throw new IllegalFormatException("Implementation of triples not found for "+name); } + /** + * Creates a dictionary + * + * @param spec specs to read dictionary + * @return Dictionary + */ public static DictionaryPrivate createDictionary(HDTOptions spec) { - String name = spec.get("dictionary.type"); - if(name==null || HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION.equals(name)) { - return new FourSectionDictionary(spec); - } - else if (HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION.equals(name)){ - return new PSFCFourSectionDictionary(spec); + String name = Objects.requireNonNullElse(spec.get(HDTOptionsKeys.DICTIONARY_TYPE_KEY), ""); + switch (name) { + case "": + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION: + return new FourSectionDictionary(spec); + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_PSFC_SECTION: + return new PSFCFourSectionDictionary(spec); + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG: + return new FourSectionDictionaryBig(spec); + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS: + return new MultipleSectionDictionary(spec); + default: + throw new IllegalFormatException("Implementation of dictionary not found for " + name); } - else if (DICTIONARY_TYPE_FOUR_SECTION_BIG.equals(name)){ - return new FourSectionDictionaryBig(spec); - }else if ((DICTIONARY_TYPE_MULTI_OBJECTS.equals(name))){ - return new MultipleSectionDictionary(spec); + } + + /** + * Creates a write-dictionary + * + * @param spec specs to read dictionary + * @param location write location + * @param bufferSize write buffer sizes + * @return WriteDictionary + */ + public static DictionaryPrivate createWriteDictionary(HDTOptions spec, Path location, int bufferSize) { + String name = Objects.requireNonNullElse(spec.get(HDTOptionsKeys.DICTIONARY_TYPE_KEY), ""); + switch (name) { + case "": + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION: + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG: + return new WriteFourSectionDictionary(spec, location, bufferSize); + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS: + 
return new WriteMultipleSectionDictionary(spec, location, bufferSize); + default: + throw new IllegalFormatException("Implementation of write dictionary not found for " + name); } - throw new IllegalFormatException("Implementation of dictionary not found for "+name); } + /** + * Creates a dictionary + * + * @param ci specs to read dictionary + * @return Dictionary + */ public static DictionaryPrivate createDictionary(ControlInfo ci) { String name = ci.getFormat(); - if(HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION.equals(name)) { - return new FourSectionDictionary(new HDTSpecification()); - } else if (HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION.equals(name)) { - return new PSFCFourSectionDictionary(new HDTSpecification()); - } else if(HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION.equals(name)){ - return new MultipleSectionDictionary(new HDTSpecification()); + switch (name) { + case HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION: + return new FourSectionDictionary(new HDTSpecification()); + case HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION: + return new PSFCFourSectionDictionary(new HDTSpecification()); + case HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION: + return new MultipleSectionDictionary(new HDTSpecification()); + default: + throw new IllegalFormatException("Implementation of dictionary not found for " + name); } - throw new IllegalFormatException("Implementation of dictionary not found for "+name); } /** * create a {@link DictionaryDiff} to create diff of a HDT in a new location + * * @param dictionary the hdt dictionary - * @param location the location of the new dictionary + * @param location the location of the new dictionary * @return dictionaryDiff */ public static DictionaryDiff createDictionaryDiff(Dictionary dictionary, String location) { String type = dictionary.getType(); - if (HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION.equals(type) || type.equals(HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION)) - return new FourSectionDictionaryDiff(location); - else if (type.equals(HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION)) - return new MultipleSectionDictionaryDiff(location); - - throw new IllegalFormatException("Implementation of DictionaryDiff not found for "+type); + switch (type) { + case HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION: + case HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION: + return new FourSectionDictionaryDiff(location); + case HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION: + return new MultipleSectionDictionaryDiff(location); + default: + throw new IllegalFormatException("Implementation of DictionaryDiff not found for " + type); + } } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionary.java index 08ff7f83..b7c055f8 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionary.java @@ -92,8 +92,8 @@ public interface TempDictionary extends Closeable { /** * Get the ID of a given String - * @param subject - * @param role + * @param subject subject + * @param role subject's role */ - public long stringToId(CharSequence subject, TripleComponentRole role); + long stringToId(CharSequence subject, TripleComponentRole role); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionarySection.java index d5914c70..51125b28 100644 --- 
a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionarySection.java @@ -27,7 +27,10 @@ */ +import org.rdfhdt.hdt.exceptions.NotImplementedException; + import java.util.Iterator; +import java.util.Map; /** * Interface that specifies the methods for a dictionary section @@ -70,4 +73,11 @@ public interface TempDictionarySection extends DictionarySection { */ Iterator getEntries(); + /** + * @return the literal counts for MultipleSectionDictionary + */ + default Map getLiteralsCounts() { + throw new NotImplementedException("getLiteralsCounts()"); + } + } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java index 9786f03e..f57504fc 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java @@ -224,7 +224,7 @@ public CharSequence idToString(long id, TripleComponentRole role) { return section.extract(localId); } @Override - public String dataTypeOfId(long id) { + public CharSequence dataTypeOfId(long id) { throw new IllegalArgumentException("Method is not applicable on this dictionary"); } @Override @@ -240,4 +240,4 @@ public long getNAllObjects() { public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { throw new NotImplementedException(); } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseTempDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseTempDictionary.java index f28c9cdd..9ec4ca2e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseTempDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseTempDictionary.java @@ -82,7 +82,7 @@ public long insert(CharSequence str, TripleComponentRole position) { public void reorganize() { // Generate shared - Iterator itSubj = ((TempDictionarySection)subjects).getEntries(); + Iterator itSubj = subjects.getEntries(); while(itSubj.hasNext()) { CharSequence str = itSubj.next(); @@ -93,7 +93,7 @@ public void reorganize() { } // Remove shared from subjects and objects - Iterator itShared = ((TempDictionarySection)shared).getEntries(); + Iterator itShared = shared.getEntries(); while(itShared.hasNext()) { CharSequence sharedStr = itShared.next(); subjects.remove(sharedStr); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java index b0084c7f..71eef73a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java @@ -1,6 +1,5 @@ package org.rdfhdt.hdt.dictionary.impl; -import org.rdfhdt.hdt.compact.integer.VByte; import org.rdfhdt.hdt.dictionary.TempDictionary; import org.rdfhdt.hdt.dictionary.TempDictionarySection; import org.rdfhdt.hdt.dictionary.impl.section.OneReadDictionarySection; @@ -14,19 +13,19 @@ import org.rdfhdt.hdt.triples.IndexedNode; import org.rdfhdt.hdt.triples.TempTriples; import org.rdfhdt.hdt.util.concurrent.ExceptionThread; -import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.io.compress.CompressUtil; -import 
org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.CompactString; +import org.rdfhdt.hdt.utils.DebugOrderNodeIterator; import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; import java.util.Comparator; +import java.util.function.Consumer; /** * Version of temp dictionary create the four sections from the SPO compressed sections result, should be loaded in a * async way with {@link org.rdfhdt.hdt.dictionary.DictionaryPrivate#loadAsync(org.rdfhdt.hdt.dictionary.TempDictionary, org.rdfhdt.hdt.listener.ProgressListener)} + * * @author Antoine Willerval */ public class CompressFourSectionDictionary implements TempDictionary { @@ -39,11 +38,13 @@ public class CompressFourSectionDictionary implements TempDictionary { private static void sendPiped(IndexedNode node, long index, PipedCopyIterator pipe, CompressUtil.DuplicatedIterator it, NodeConsumerMethod method) { it.setLastHeader(index); method.consume(node.getIndex(), index); - pipe.addElement(node.getNode().toString()); + pipe.addElement(new CompactString(node.getNode())); } - public CompressFourSectionDictionary(CompressionResult compressionResult, NodeConsumer nodeConsumer, ProgressListener listener) { + public CompressFourSectionDictionary(CompressionResult compressionResult, NodeConsumer nodeConsumer, ProgressListener listener, boolean debugOrder) { long splits = Math.max(20, compressionResult.getTripleCount() / 10_000); + Consumer debugOrderCheckerS = DebugOrderNodeIterator.of(debugOrder, "Subject"); + Consumer debugOrderCheckerO = DebugOrderNodeIterator.of(debugOrder, "Object"); // send duplicate to the consumer while reading the nodes CompressUtil.DuplicatedIterator sortedSubject = CompressUtil.asNoDupeCharSequenceIterator( @@ -89,55 +90,70 @@ public CompressFourSectionDictionary(CompressionResult compressionResult, NodeCo PipedCopyIterator shared = new PipedCopyIterator<>(); Comparator comparator = CharSequenceComparator.getInstance(); cfsdThread = new ExceptionThread(() -> { - long sharedId = 1; - long subjectId = 1; - long objectId = 1; - sharedLoop: - while (sortedObject.hasNext() && sortedSubject.hasNext()) { - // last was a shared node - IndexedNode newSubject = sortedSubject.next(); - IndexedNode newObject = sortedObject.next(); - int comp = comparator.compare(newSubject.getNode(), newObject.getNode()); - while (comp != 0) { - if (comp < 0) { - sendPiped(newSubject, CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); - if (!sortedSubject.hasNext()) { - // no more subjects, send the current object and break the shared loop - sendPiped(newObject, CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); - break sharedLoop; - } - newSubject = sortedSubject.next(); - } else { - sendPiped(newObject, CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); - if (!sortedObject.hasNext()) { - // no more objects, send the current subject and break the shared loop + try { + long sharedId = 1; + long subjectId = 1; + long objectId = 1; + sharedLoop: + while (sortedObject.hasNext() && sortedSubject.hasNext()) { + // last was a shared node + IndexedNode newSubject = sortedSubject.next(); + IndexedNode newObject = sortedObject.next(); + debugOrderCheckerS.accept(newSubject); + debugOrderCheckerO.accept(newObject); + int comp = comparator.compare(newSubject.getNode(), newObject.getNode()); + while (comp != 0) { + if (comp < 0) { 
sendPiped(newSubject, CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); - break sharedLoop; + if (!sortedSubject.hasNext()) { + // no more subjects, send the current object and break the shared loop + sendPiped(newObject, CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + break sharedLoop; + } + newSubject = sortedSubject.next(); + debugOrderCheckerS.accept(newSubject); + } else { + sendPiped(newObject, CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + if (!sortedObject.hasNext()) { + // no more objects, send the current subject and break the shared loop + sendPiped(newSubject, CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); + break sharedLoop; + } + newObject = sortedObject.next(); + debugOrderCheckerO.accept(newObject); } - newObject = sortedObject.next(); + comp = comparator.compare(newSubject.getNode(), newObject.getNode()); } - comp = comparator.compare(newSubject.getNode(), newObject.getNode()); + // shared element + long shid = CompressUtil.asShared(sharedId++); + sortedSubject.setLastHeader(shid); + sortedObject.setLastHeader(shid); + nodeConsumer.onSubject(newSubject.getIndex(), shid); + nodeConsumer.onObject(newObject.getIndex(), shid); + shared.addElement(new CompactString(newSubject.getNode())); } - // shared element - long shid = CompressUtil.asShared(sharedId++); - sortedSubject.setLastHeader(shid); - sortedObject.setLastHeader(shid); - nodeConsumer.onSubject(newSubject.getIndex(), shid); - nodeConsumer.onObject(newObject.getIndex(), shid); - shared.addElement(newSubject.getNode().toString()); - } - // at least one iterator is empty, closing the shared pipe - shared.closePipe(); - // do we have subjects? - while (sortedSubject.hasNext()) { - sendPiped(sortedSubject.next(), CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); - } - subject.closePipe(); - // do we have objects? - while (sortedObject.hasNext()) { - sendPiped(sortedObject.next(), CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + // at least one iterator is empty, closing the shared pipe + shared.closePipe(); + // do we have subjects? + while (sortedSubject.hasNext()) { + IndexedNode next = sortedSubject.next(); + debugOrderCheckerS.accept(next); + sendPiped(next, CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); + } + subject.closePipe(); + // do we have objects? 
+ while (sortedObject.hasNext()) { + IndexedNode next = sortedObject.next(); + debugOrderCheckerO.accept(next); + sendPiped(next, CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + } + object.closePipe(); + } catch (Throwable t) { + object.closePipe(t); + subject.closePipe(t); + shared.closePipe(t); + throw t; } - object.closePipe(); }, "CFSDPipeBuilder").startAll(); // send to the consumer the element while parsing them @@ -147,7 +163,7 @@ public CompressFourSectionDictionary(CompressionResult compressionResult, NodeCo sortedPredicate.setLastHeader(header); nodeConsumer.onPredicate(node.getIndex(), header); // force duplication because it's not made in a pipe like with the others - return node.getNode().toString(); + return new CompactString(node.getNode()); }), predicates); this.object = new OneReadDictionarySection(object, objects); this.shared = new OneReadDictionarySection(shared, shareds); @@ -222,7 +238,9 @@ public void close() throws IOException { public interface NodeConsumer { void onSubject(long preMapId, long newMapId); + void onPredicate(long preMapId, long newMapId); + void onObject(long preMapId, long newMapId); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java index 5a406cb9..23f6fe0e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java @@ -15,10 +15,8 @@ import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.util.io.IOUtil; -import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; @@ -33,7 +31,7 @@ public class FourSectionDictionaryDiff implements DictionaryDiff { private final String location; - private final HashMap allMappings = new HashMap<>(); + private final Map allMappings = new HashMap<>(); private CatMapping mappingBack; public long numShared; @@ -51,7 +49,7 @@ public void close() throws IOException { } } @Override - public void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) throws IOException { + public void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) throws IOException { allMappings.put("predicate", new CatMapping(location, "predicate", dictionary.getPredicates().getNumberOfElements())); allMappings.put("subject", new CatMapping(location, "subject", dictionary.getSubjects().getNumberOfElements())); allMappings.put("object", new CatMapping(location, "object", dictionary.getObjects().getNumberOfElements())); @@ -232,7 +230,7 @@ public int count() { } @Override - public HashMap getAllMappings() { + public Map getAllMappings() { return allMappings; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/HashDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/HashDictionary.java index 6f41f504..a56652e0 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/HashDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/HashDictionary.java @@ -45,7 +45,7 @@ */ public class HashDictionary extends BaseTempDictionary { - boolean isCustom = false; + boolean isCustom; public HashDictionary(HDTOptions spec,boolean isCustom) { super(spec); this.isCustom = isCustom; 
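Note (illustrative sketch, not part of the patch): the CompressFourSectionDictionary hunk above wraps the pipe-feeding thread in try/catch and, on failure, calls closePipe(t) on every PipedCopyIterator, so a consumer blocked on the pipe rethrows the producer's error instead of hanging forever. In isolation, and assuming PipedCopyIterator is consumed as a plain java.util.Iterator, the pattern with the patch's ExceptionThread helper looks like this:

    PipedCopyIterator<String> pipe = new PipedCopyIterator<>();
    new ExceptionThread(() -> {
        try {
            pipe.addElement("element"); // produce as many elements as needed
            pipe.closePipe();           // normal end of the stream
        } catch (Throwable t) {
            pipe.closePipe(t);          // forward the failure to the consumer side
            throw t;                    // keep it in the ExceptionThread as well
        }
    }, "ProducerThread").startAll();
    while (pipe.hasNext()) {            // rethrows the producer's failure, if any
        System.out.println(pipe.next());
    }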
@@ -68,7 +68,7 @@ public void reorganize(TempTriples triples) { StopWatch st = new StopWatch(); // Generate old subject mapping - Iterator itSubj = ((TempDictionarySection) subjects).getEntries(); + Iterator itSubj = subjects.getEntries(); while(itSubj.hasNext()) { CharSequence str = itSubj.next(); mapSubj.add(str); @@ -82,21 +82,21 @@ public void reorganize(TempTriples triples) { // Generate old predicate mapping st.reset(); - Iterator itPred = ((TempDictionarySection) predicates).getEntries(); + Iterator itPred = predicates.getEntries(); while(itPred.hasNext()) { CharSequence str = itPred.next(); mapPred.add(str); } // Generate old object mapping - Iterator itObj = ((TempDictionarySection) objects).getEntries(); + Iterator itObj = objects.getEntries(); while(itObj.hasNext()) { CharSequence str = itObj.next(); mapObj.add(str); } // Remove shared from subjects and objects - Iterator itShared = ((TempDictionarySection) shared).getEntries(); + Iterator itShared = shared.getEntries(); while(itShared.hasNext()) { CharSequence sharedStr = itShared.next(); subjects.remove(sharedStr); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java index efb55009..3e552bf0 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java @@ -1,9 +1,11 @@ package org.rdfhdt.hdt.dictionary.impl; +import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySectionMap; import org.rdfhdt.hdt.dictionary.impl.section.PFCOptimizedExtractor; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; import java.util.AbstractMap; import java.util.Iterator; @@ -12,7 +14,7 @@ public class MultDictionaryPFCOptimizedExtractor implements OptimizedExtractor{ private final PFCOptimizedExtractor shared, subjects, predicates; - private final TreeMap objects; + private final TreeMap objects; private final long numshared; public MultDictionaryPFCOptimizedExtractor(MultipleSectionDictionary origDict) { @@ -20,11 +22,9 @@ public MultDictionaryPFCOptimizedExtractor(MultipleSectionDictionary origDict) { shared = new PFCOptimizedExtractor((PFCDictionarySectionMap) origDict.shared); subjects = new PFCOptimizedExtractor((PFCDictionarySectionMap) origDict.subjects); predicates = new PFCOptimizedExtractor((PFCDictionarySectionMap) origDict.predicates); - objects = new TreeMap<>(); - Iterator iterator = origDict.getAllObjects().entrySet().iterator(); - while (iterator.hasNext()){ - Map.Entry entry = (Map.Entry)iterator.next(); - objects.put((String)entry.getKey(),new PFCOptimizedExtractor((PFCDictionarySectionMap)entry.getValue())); + objects = new TreeMap<>(CharSequenceComparator.getInstance()); + for (Map.Entry entry : origDict.getAllObjects().entrySet()) { + objects.put(entry.getKey(), new PFCOptimizedExtractor((PFCDictionarySectionMap) entry.getValue())); } } @@ -32,7 +32,7 @@ public MultDictionaryPFCOptimizedExtractor(MultipleSectionDictionary origDict) { public CharSequence idToString(long id, TripleComponentRole role) { AbstractMap.SimpleEntry section = getSection(id, role); long localId = getLocalId(id, role); - if(section.getKey().equals("NO_DATATYPE") || 
section.getKey().equals("section")) + if(section.getKey().equals(LiteralsUtils.NO_DATATYPE_STR) || section.getKey().equals("section")) return section.getValue().extract(localId); else { String label = section.getValue().extract(localId).toString(); @@ -59,18 +59,18 @@ private AbstractMap.SimpleEntry getSection(long id if(id<= numshared) { return new AbstractMap.SimpleEntry<>("section",shared); } else { - Iterator hmIterator = objects.entrySet().iterator(); + Iterator> hmIterator = objects.entrySet().iterator(); // iterate over all subsections in the objects section PFCOptimizedExtractor desiredSection = null; String type = ""; int count = 0; - while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - PFCOptimizedExtractor subSection = (PFCOptimizedExtractor)entry.getValue(); + while (hmIterator.hasNext()) { + Map.Entry entry = hmIterator.next(); + PFCOptimizedExtractor subSection = entry.getValue(); count+= subSection.getNumStrings(); if(id <= numshared+count){ desiredSection = subSection; - type = (String)entry.getKey(); + type = entry.getKey().toString(); break; } } @@ -92,14 +92,14 @@ private long getLocalId(long id, TripleComponentRole position) { if(id<=numshared) { return id; } else { - Iterator hmIterator = objects.entrySet().iterator(); + Iterator> hmIterator = objects.entrySet().iterator(); // iterate over all subsections in the objects section long count = 0; while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - PFCOptimizedExtractor subSection = (PFCOptimizedExtractor)entry.getValue(); + Map.Entry entry = hmIterator.next(); + PFCOptimizedExtractor subSection = entry.getValue(); count+= subSection.getNumStrings(); - if(id <= numshared+ count){ + if(id <= numshared + count){ count -= subSection.getNumStrings(); break; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java index 25175178..15f364e7 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java @@ -5,47 +5,47 @@ import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; import org.rdfhdt.hdt.dictionary.TempDictionary; -import org.rdfhdt.hdt.dictionary.impl.section.PFCOptimizedExtractor; import org.rdfhdt.hdt.enums.DictionarySectionRole; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CompactString; -import org.rdfhdt.hdt.util.string.DelayedString; import java.util.AbstractMap; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; -import java.util.regex.Pattern; public abstract class MultipleBaseDictionary implements DictionaryPrivate { + private static final CharSequence SECTION = new CompactString("section"); protected final HDTOptions spec; protected DictionarySectionPrivate subjects; protected DictionarySectionPrivate predicates; - protected TreeMap objects; + protected TreeMap objects; protected DictionarySectionPrivate shared; public MultipleBaseDictionary(HDTOptions spec) { this.spec = spec; } - protected long getGlobalId(long id, DictionarySectionRole 
position,CharSequence str) {
+ protected long getGlobalId(long id, DictionarySectionRole position, CharSequence str) {
switch (position) {
case SUBJECT:
return id + shared.getNumberOfElements();
case OBJECT: {
- Iterator iter = objects.entrySet().iterator();
+ Iterator<Map.Entry<CharSequence, DictionarySectionPrivate>> iter = objects.entrySet().iterator();
int count = 0;
- while (iter.hasNext()){
- Map.Entry entry = (Map.Entry)iter.next();
- count+= ((DictionarySectionPrivate)entry.getValue()).getNumberOfElements();
- if(LiteralsUtils.getType(str).equals((String)entry.getKey())){
- count -= ((DictionarySectionPrivate)entry.getValue()).getNumberOfElements();
+ CharSequence type = LiteralsUtils.getType(ByteStringUtil.asByteString(str));
+ while (iter.hasNext()) {
+ Map.Entry<CharSequence, DictionarySectionPrivate> entry = iter.next();
+ count+= entry.getValue().getNumberOfElements();
+ if(type.equals(entry.getKey())) {
+ count -= entry.getValue().getNumberOfElements();
break;
}
@@ -76,18 +76,21 @@ protected long getLocalId(long id, TripleComponentRole position) {
if(id<=shared.getNumberOfElements()) {
return id;
} else {
- Iterator hmIterator = objects.entrySet().iterator();
+ Iterator<Map.Entry<CharSequence, DictionarySectionPrivate>> hmIterator = objects.entrySet().iterator();
// iterate over all subsections in the objects section
long count = 0;
- while (hmIterator.hasNext()){
- Map.Entry entry = (Map.Entry)hmIterator.next();
- long numElts = 0;
- if(entry.getValue() instanceof DictionarySectionPrivate)
- numElts = ((DictionarySectionPrivate)entry.getValue()).getNumberOfElements();
- else if(entry.getValue() instanceof PFCOptimizedExtractor)
- numElts = ((PFCOptimizedExtractor)entry.getValue()).getNumStrings();
- count+= numElts;
- if(id <= shared.getNumberOfElements()+ count){
+ while (hmIterator.hasNext()) {
+ Map.Entry<CharSequence, DictionarySectionPrivate> entry = hmIterator.next();
+ long numElts;
+
+ // note: objects now only holds DictionarySectionPrivate values, so the
+ // old PFCOptimizedExtractor branch was dead code:
+ //if (entry.getValue() instanceof PFCOptimizedExtractor) {
+ // numElts = ((PFCOptimizedExtractor)entry.getValue()).getNumStrings();
+ //} else {
+ numElts = entry.getValue().getNumberOfElements();
+ //}
+ count += numElts;
+ if(id <= shared.getNumberOfElements() + count){
count -= numElts;
break;
}
@@ -106,18 +109,13 @@ else if(entry.getValue() instanceof PFCOptimizedExtractor)
*/
@Override
public long stringToId(CharSequence str, TripleComponentRole position) {
- str = DelayedString.unwrap(str);
+ str = ByteStringUtil.asByteString(str);
- if(str==null || str.length()==0) {
+ if (str == null || str.length() == 0) {
return 0;
}
- if(str instanceof String) {
// CompactString is more efficient for the binary search.
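A note on the CharSequence keys introduced above: java.lang.CharSequence specifies no equals/compareTo behaviour across implementations, so a String and a CompactString holding the same characters are not interchangeable map keys by default. That is why every object-subsection TreeMap in this patch is created with CharSequenceComparator.getInstance() and why lookups normalize the query through ByteStringUtil.asByteString. A small sketch, assuming (as its use here suggests) that the comparator orders by character content:

    TreeMap<CharSequence, Long> sizes = new TreeMap<>(CharSequenceComparator.getInstance());
    sizes.put(new CompactString("<http://www.w3.org/2001/XMLSchema#int>"), 42L);

    // TreeMap.get consults only the comparator, never equals(), so a plain
    // String query still finds the CompactString key:
    Long n = sizes.get("<http://www.w3.org/2001/XMLSchema#int>"); // 42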
- str = new CompactString(str); - } - - long ret=0; + long ret; switch(position) { case SUBJECT: ret = shared.locate(str); @@ -136,19 +134,20 @@ public long stringToId(CharSequence str, TripleComponentRole position) { } return -1; case OBJECT: - if(str.charAt(0)!='"') { + if (str.charAt(0)!='"') { ret = shared.locate(str); if(ret!=0) { return getGlobalId(ret, DictionarySectionRole.SHARED,str); } } DictionarySectionPrivate subSection = getSubSection(str); - if( subSection!= null) - ret = subSection.locate(new CompactString(LiteralsUtils.removeType(str))); - else + if (subSection!= null) { + ret = subSection.locate(LiteralsUtils.removeType(str)); + } else { return -1; - if(ret!=0) { - return getGlobalId(ret, DictionarySectionRole.OBJECT,str); + } + if (ret != 0) { + return getGlobalId(ret, DictionarySectionRole.OBJECT, str); } return -1; default: @@ -157,15 +156,8 @@ public long stringToId(CharSequence str, TripleComponentRole position) { } private long getNumberObjectsAllSections(){ - Iterator hmIterator = objects.entrySet().iterator(); // iterate over all subsections in the objects section - long total = 0; - while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - DictionarySectionPrivate subSection = (DictionarySectionPrivate) entry.getValue(); - total += subSection.getNumberOfElements(); - } - return total; + return objects.values().stream().mapToLong(DictionarySection::getNumberOfElements).sum(); } @Override public long getNumberOfElements() { @@ -175,7 +167,10 @@ public long getNumberOfElements() { @Override public long size() { - return subjects.size()+predicates.size()+objects.size()+shared.size(); + return subjects.size() + + predicates.size() + + objects.values().stream().mapToLong(DictionarySection::size).sum() + + shared.size(); } @Override @@ -209,7 +204,7 @@ public DictionarySection getPredicates() { } @Override - public TreeMap getAllObjects() { + public Map getAllObjects() { return new TreeMap<>(this.objects); } @@ -223,33 +218,33 @@ public DictionarySection getShared() { return shared; } - private AbstractMap.SimpleEntry getSection(long id, TripleComponentRole role) { + private AbstractMap.SimpleEntry getSection(long id, TripleComponentRole role) { switch (role) { case SUBJECT: if(id<=shared.getNumberOfElements()) { - return new AbstractMap.SimpleEntry<>("section",shared); + return new AbstractMap.SimpleEntry<>(SECTION,shared); } else { - return new AbstractMap.SimpleEntry<>("section",subjects); + return new AbstractMap.SimpleEntry<>(SECTION,subjects); } case PREDICATE: - return new AbstractMap.SimpleEntry<>("section",predicates); + return new AbstractMap.SimpleEntry<>(SECTION,predicates); case OBJECT: if(id<=shared.getNumberOfElements()) { - return new AbstractMap.SimpleEntry<>("section",shared); + return new AbstractMap.SimpleEntry<>(SECTION,shared); } else { - Iterator hmIterator = objects.entrySet().iterator(); + Iterator> hmIterator = objects.entrySet().iterator(); // iterate over all subsections in the objects section DictionarySectionPrivate desiredSection = null; - String type = ""; + CharSequence type = CompactString.EMPTY; int count = 0; while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - DictionarySectionPrivate subSection = (DictionarySectionPrivate)entry.getValue(); + Map.Entry entry = hmIterator.next(); + DictionarySectionPrivate subSection = entry.getValue(); count += subSection.getNumberOfElements(); if(id <= shared.getNumberOfElements()+ count){ desiredSection = subSection; - type = 
(String)entry.getKey(); + type = entry.getKey(); break; } } @@ -259,24 +254,23 @@ private AbstractMap.SimpleEntry getSection(long throw new IllegalArgumentException(); } } - static Pattern pattern = Pattern.compile("@[a-zA-Z0-9\\-]+$"); /* (non-Javadoc) * @see hdt.dictionary.Dictionary#idToString(int, datatypes.TripleComponentRole) */ @Override public CharSequence idToString(long id, TripleComponentRole role) { - AbstractMap.SimpleEntry section = getSection(id, role); + AbstractMap.SimpleEntry section = getSection(id, role); long localId = getLocalId(id, role); - if(section.getKey().equals("NO_DATATYPE") || section.getKey().equals("section")) + if(section.getKey().equals(LiteralsUtils.NO_DATATYPE) || section.getKey().equals(SECTION)) return section.getValue().extract(localId); else { - if(section.getValue() == null){ + if(section.getValue() == null) { // this should not happen, means that the given id wasn't found in any section System.out.println("Error couldn't find the section for the given ID: ["+id+"]"); return null; }else { - String label = section.getValue().extract(localId).toString(); - String dType = section.getKey(); + CharSequence label = section.getValue().extract(localId); + CharSequence dType = section.getKey(); //Matcher matcher = pattern.matcher(label); if (LiteralsUtils.containsLanguage(label)) { return label; @@ -287,47 +281,36 @@ public CharSequence idToString(long id, TripleComponentRole role) { } } private DictionarySectionPrivate getSubSection(CharSequence str){ - String dataType = ""; -// if(str.toString().startsWith("\"")) { -// if(str.toString().matches("\".*\"\\^\\^<.*>")){ -// dataType = str.toString().split("\\^")[2]; -// }else{ -// dataType = "NO_DATATYPE"; -// } -// }else{ -// dataType = "NO_DATATYPE"; -// } - dataType = LiteralsUtils.getType(str); - return objects.get(dataType); + return objects.get(LiteralsUtils.getType(str)); } @Override - public String dataTypeOfId(long id) { + public CharSequence dataTypeOfId(long id) { return getSection(id,TripleComponentRole.OBJECT).getKey(); } - public AbstractMap.SimpleEntry getDataTypeRange(String dataType){ - if(!dataType.equals("NO_DATATYPE")) - dataType = "<"+dataType+">"; - if(objects.containsKey(dataType)) { // literals subsection exist - Iterator iter = objects.entrySet().iterator(); + + public AbstractMap.SimpleEntry getDataTypeRange(CharSequence dataType){ + CharSequence seq = LiteralsUtils.embed(ByteStringUtil.asByteString(dataType)); + if(objects.containsKey(seq)) { // literals subsection exist + Iterator> iter = objects.entrySet().iterator(); int count = 0; while (iter.hasNext()) { - Map.Entry entry = (Map.Entry) iter.next(); - count += ((DictionarySectionPrivate) entry.getValue()).getNumberOfElements(); - if (dataType.equals((String) entry.getKey())) { - count -= ((DictionarySectionPrivate) entry.getValue()).getNumberOfElements(); + Map.Entry entry = iter.next(); + count += entry.getValue().getNumberOfElements(); + if (seq.equals(entry.getKey())) { + count -= entry.getValue().getNumberOfElements(); break; } } long offset = shared.getNumberOfElements() + count; - long size = offset + objects.get(dataType).getNumberOfElements(); + long size = offset + objects.get(seq).getNumberOfElements(); return new AbstractMap.SimpleEntry<>(offset +1, size); } return new AbstractMap.SimpleEntry<>(0L,0L); } @Override - public void loadAsync(TempDictionary other, ProgressListener listener) { + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { throw new 
NotImplementedException(); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java index 67f4bf69..4f891d8f 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java @@ -1,10 +1,10 @@ package org.rdfhdt.hdt.dictionary.impl; +import org.rdfhdt.hdt.compact.integer.VByte; import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; import org.rdfhdt.hdt.dictionary.TempDictionary; import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory; -import org.rdfhdt.hdt.dictionary.impl.section.HashDictionarySection; import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySection; import org.rdfhdt.hdt.exceptions.IllegalFormatException; import org.rdfhdt.hdt.exceptions.NotImplementedException; @@ -19,221 +19,217 @@ import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.string.ByteStringUtil; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.CompactString; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; public class MultipleSectionDictionary extends MultipleBaseDictionary { - public MultipleSectionDictionary(HDTOptions spec) { - super(spec); - // FIXME: Read type from spec. 
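getDataTypeRange above reduces to simple interval arithmetic: global object IDs start with the shared strings, followed by each typed subsection in comparator order, so a datatype's range is (offset + 1, offset + n), where offset counts everything stored before its subsection. A worked example with made-up sizes:

    // Say the dictionary holds 10 shared strings, then (in key order)
    // 5 xsd:int literals and 7 xsd:string literals. For xsd:string:
    long shared = 10, intLiterals = 5, stringLiterals = 7;

    long offset = shared + intLiterals;    // 15 strings precede the section
    long first  = offset + 1;              // 16: first xsd:string ID
    long last   = offset + stringLiterals; // 22: last xsd:string ID
    // getDataTypeRange would return the entry (16, 22) for this layout.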
- subjects = new PFCDictionarySection(spec); - predicates = new PFCDictionarySection(spec); - objects = new TreeMap(); - shared = new PFCDictionarySection(spec); - } - - /* (non-Javadoc) - * @see hdt.dictionary.Dictionary#load(hdt.dictionary.Dictionary) - */ - @Override - public void load(TempDictionary other, ProgressListener listener) { - IntermediateListener iListener = new IntermediateListener(listener); - subjects.load(other.getSubjects(), iListener); - predicates.load(other.getPredicates(), iListener); - Iterator iter = other.getObjects().getEntries(); - - // TODO: allow the usage of OneReadDictionarySection - HashMap literalsCounts = ((HashDictionarySection)other.getObjects()).getLiteralsCounts(); - if(literalsCounts.containsKey("NO_DATATYPE")) - literalsCounts.put("NO_DATATYPE",literalsCounts.get("NO_DATATYPE") - other.getShared().getNumberOfElements()); - CustomIterator customIterator = new CustomIterator(iter,literalsCounts); - long startTime = System.currentTimeMillis(); - while (customIterator.hasNext()){ - PFCDictionarySection section = new PFCDictionarySection(spec); - String type = LiteralsUtils.getType(customIterator.prev); - long numEntries = literalsCounts.get(type); - - section.load(customIterator,numEntries,listener); - long locate = section.locate(new CompactString("\"\uD83C\uDDEB\uD83C\uDDF7\"@ro")); - objects.put(type,section); - } - long endTime = System.currentTimeMillis(); - //System.out.println("Loaded objects subsections in: "+(endTime - startTime)+" ms"); - shared.load(other.getShared(), iListener); - } - - /* (non-Javadoc) - * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener) - */ - @Override - public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { - ci.setType(ControlInfo.Type.DICTIONARY); - ci.setFormat(getType()); - ci.setInt("elements", this.getNumberOfElements()); - ci.save(output); - - IntermediateListener iListener = new IntermediateListener(listener); - shared.save(output, iListener); - subjects.save(output, iListener); - predicates.save(output, iListener); - - writeLiteralsMap(output, iListener); - - } - /* - ------------------ - |len| Literal URI| - ------------------ - */ - private void writeLiteralsMap(OutputStream output,ProgressListener listener) throws IOException { - Iterator hmIterator = objects.entrySet().iterator(); - int numberOfTypes = objects.size(); - output.write(numberOfTypes); - - ArrayList types = new ArrayList<>(); - - while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - String uri = (String)entry.getKey(); - output.write(uri.length()); - IOUtil.writeBuffer(output, uri.getBytes(), 0, uri.getBytes().length, listener); - types.add(uri); - } - for(String type:types){ - this.objects.get(type).save(output,listener); - } - } - private void readLiteralsMap(InputStream input,ProgressListener listener) throws IOException { - int numberOfTypes = input.read(); - ArrayList types = new ArrayList<>(); - for (int i = 0; i < numberOfTypes; i++) { - int length = input.read(); - byte[] type = IOUtil.readBuffer(input, length, listener); - types.add(new String(type)); - } - for(String type:types){ - this.objects.put(type,DictionarySectionFactory.loadFrom(input,listener)); - } - } - private void mapLiteralsMap(CountInputStream input,File f,ProgressListener listener) throws IOException { - int numberOfTypes = input.read(); - ArrayList types = new ArrayList<>(); - for (int i = 0; i < numberOfTypes; i++) { - int length = 
input.read(); - byte[] type = IOUtil.readBuffer(input, length, listener); - String typeStr = new String(type); - types.add(typeStr); - } - for(String type:types){ - this.objects.put(type,DictionarySectionFactory.loadFrom(input,f,listener)); - } - - } - - - /* (non-Javadoc) - * @see hdt.dictionary.Dictionary#load(java.io.InputStream) - */ - @Override - public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException { - if(ci.getType()!=ControlInfo.Type.DICTIONARY) { - throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); - } - - IntermediateListener iListener = new IntermediateListener(listener); - - shared = DictionarySectionFactory.loadFrom(input, iListener); - subjects = DictionarySectionFactory.loadFrom(input, iListener); - predicates = DictionarySectionFactory.loadFrom(input, iListener); - - readLiteralsMap(input,listener); - } - - @Override - public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { - ControlInformation ci = new ControlInformation(); - ci.load(in); - if(ci.getType()!=ControlInfo.Type.DICTIONARY) { - throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); - } - - IntermediateListener iListener = new IntermediateListener(listener); - shared = DictionarySectionFactory.loadFrom(in, f, iListener); - subjects = DictionarySectionFactory.loadFrom(in, f, iListener); - predicates = DictionarySectionFactory.loadFrom(in, f, iListener); - - mapLiteralsMap(in,f,listener); - - // Use cache only for predicates. Preload only up to 100K predicates. - // FIXME: DISABLED + public MultipleSectionDictionary(HDTOptions spec) { + super(spec); + // FIXME: Read type from spec. + subjects = new PFCDictionarySection(spec); + predicates = new PFCDictionarySection(spec); + objects = new TreeMap<>(CharSequenceComparator.getInstance()); + shared = new PFCDictionarySection(spec); + } + + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#load(hdt.dictionary.Dictionary) + */ + @Override + public void load(TempDictionary other, ProgressListener listener) { + IntermediateListener iListener = new IntermediateListener(listener); + subjects.load(other.getSubjects(), iListener); + predicates.load(other.getPredicates(), iListener); + Iterator iter = other.getObjects().getEntries(); + + // TODO: allow the usage of OneReadDictionarySection + Map literalsCounts = new HashMap<>(other.getObjects().getLiteralsCounts()); + literalsCounts.computeIfPresent(LiteralsUtils.NO_DATATYPE, (key, value) -> (value - other.getShared().getNumberOfElements())); + CustomIterator customIterator = new CustomIterator(iter, literalsCounts); + while (customIterator.hasNext()) { + PFCDictionarySection section = new PFCDictionarySection(spec); + CharSequence type = LiteralsUtils.getType(customIterator.prev); + long numEntries = literalsCounts.get(type); + + section.load(customIterator, numEntries, listener); + section.locate(new CompactString("\"\uD83C\uDDEB\uD83C\uDDF7\"@ro")); + objects.put(type, section); + } + shared.load(other.getShared(), iListener); + } + + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener) + */ + @Override + public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { + ci.setType(ControlInfo.Type.DICTIONARY); + ci.setFormat(getType()); + ci.setInt("elements", this.getNumberOfElements()); + ci.save(output); + + IntermediateListener 
iListener = new IntermediateListener(listener); + shared.save(output, iListener); + subjects.save(output, iListener); + predicates.save(output, iListener); + + writeLiteralsMap(output, iListener); + + } + + /* + ------------------ + |len| Literal URI| + ------------------ + */ + private void writeLiteralsMap(OutputStream output, ProgressListener listener) throws IOException { + Iterator> hmIterator = objects.entrySet().iterator(); + int numberOfTypes = objects.size(); + VByte.encode(output, numberOfTypes); + + List types = new ArrayList<>(); + + while (hmIterator.hasNext()) { + Map.Entry entry = hmIterator.next(); + CharSequence uri = entry.getKey(); + String uriStr = uri.toString(); + byte[] bytes = uriStr.getBytes(); + VByte.encode(output, bytes.length); + IOUtil.writeBuffer(output, bytes, 0, bytes.length, listener); + types.add(uri); + } + for (CharSequence type : types) { + this.objects.get(type).save(output, listener); + } + } + + private void readLiteralsMap(InputStream input, ProgressListener listener) throws IOException { + int numberOfTypes = (int) VByte.decode(input); + List types = new ArrayList<>(); + for (int i = 0; i < numberOfTypes; i++) { + int length = (int) VByte.decode(input); + byte[] type = IOUtil.readBuffer(input, length, listener); + types.add(new CompactString(type)); + } + for (CharSequence type : types) { + this.objects.put(type, DictionarySectionFactory.loadFrom(input, listener)); + } + } + + private void mapLiteralsMap(CountInputStream input, File f, ProgressListener listener) throws IOException { + int numberOfTypes = (int) VByte.decode(input); + List types = new ArrayList<>(); + for (int i = 0; i < numberOfTypes; i++) { + int length = (int) VByte.decode(input); + byte[] type = IOUtil.readBuffer(input, length, listener); + types.add(new CompactString(type)); + } + for (CharSequence type : types) { + this.objects.put(type, DictionarySectionFactory.loadFrom(input, f, listener)); + } + + } + + + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#load(java.io.InputStream) + */ + @Override + public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException { + if (ci.getType() != ControlInfo.Type.DICTIONARY) { + throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); + } + + IntermediateListener iListener = new IntermediateListener(listener); + + shared = DictionarySectionFactory.loadFrom(input, iListener); + subjects = DictionarySectionFactory.loadFrom(input, iListener); + predicates = DictionarySectionFactory.loadFrom(input, iListener); + + readLiteralsMap(input, listener); + } + + @Override + public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { + ControlInformation ci = new ControlInformation(); + ci.load(in); + if (ci.getType() != ControlInfo.Type.DICTIONARY) { + throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); + } + + IntermediateListener iListener = new IntermediateListener(listener); + shared = DictionarySectionFactory.loadFrom(in, f, iListener); + subjects = DictionarySectionFactory.loadFrom(in, f, iListener); + predicates = DictionarySectionFactory.loadFrom(in, f, iListener); + + mapLiteralsMap(in, f, listener); + + // Use cache only for predicates. Preload only up to 100K predicates. 
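The move to VByte in writeLiteralsMap/readLiteralsMap above is a format fix, not a style change: the old code framed the type table with OutputStream.write(int), which keeps only the low 8 bits, so more than 255 datatypes, or a datatype URI longer than 255 bytes, silently corrupted the dictionary. VByte has no such bound. A round-trip sketch with the VByte helper the patch uses:

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    VByte.encode(out, 300); // write(300) would have stored 300 & 0xFF == 44

    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
    long n = VByte.decode(in); // 300, regardless of magnitude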
+ // FIXME: DISABLED // predicates = new DictionarySectionCacheAll(predicates, predicates.getNumberOfElements()<100000); - } - - @Override - public long getNAllObjects() { - Iterator hmIterator = objects.entrySet().iterator(); - long count = 0; - while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - count += ((DictionarySectionPrivate)entry.getValue()).getNumberOfElements(); - } - return count; - } - - @Override - public TreeMap getAllObjects() { - return new TreeMap<>(objects); - } - - /* (non-Javadoc) - * @see hdt.dictionary.Dictionary#populateHeader(hdt.header.Header, java.lang.String) - */ - @Override - public void populateHeader(Header header, String rootNode) { - header.insert(rootNode, HDTVocabulary.DICTIONARY_TYPE, getType()); + } + + @Override + public long getNAllObjects() { + return objects.values().stream().mapToLong(DictionarySectionPrivate::getNumberOfElements).sum(); + } + + @Override + public TreeMap getAllObjects() { + return new TreeMap<>(objects); + } + + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#populateHeader(hdt.header.Header, java.lang.String) + */ + @Override + public void populateHeader(Header header, String rootNode) { + header.insert(rootNode, HDTVocabulary.DICTIONARY_TYPE, getType()); // header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSUBJECTS, getNsubjects()); // header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMPREDICATES, getNpredicates()); // header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMOBJECTS, getNobjects()); - header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSHARED, getNshared()); + header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSHARED, getNshared()); // header.insert(rootNode, HDTVocabulary.DICTIONARY_MAXSUBJECTID, getMaxSubjectID()); // header.insert(rootNode, HDTVocabulary.DICTIONARY_MAXPREDICATEID, getMaxPredicateID()); // header.insert(rootNode, HDTVocabulary.DICTIONARY_MAXOBJECTTID, getMaxObjectID()); - header.insert(rootNode, HDTVocabulary.DICTIONARY_SIZE_STRINGS, size()); - } - - /* (non-Javadoc) - * @see hdt.dictionary.Dictionary#getType() - */ - @Override - public String getType() { - return HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION; - } - - @Override - public void close() throws IOException { - shared.close(); - subjects.close(); - predicates.close(); - - // close all subsections - Iterator hmIterator = objects.entrySet().iterator(); - while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - ((DictionarySectionPrivate)entry.getValue()).close(); - } - - } - - @Override - public void loadAsync(TempDictionary other, ProgressListener listener) { - throw new NotImplementedException(); - } + header.insert(rootNode, HDTVocabulary.DICTIONARY_SIZE_STRINGS, size()); + } + + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#getType() + */ + @Override + public String getType() { + return HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION; + } + + @Override + public void close() throws IOException { + shared.close(); + subjects.close(); + predicates.close(); + + // close all subsections + IOUtil.closeAll(objects.values()); + } + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) { + throw new NotImplementedException(); + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java index e859fd32..3a058f1e 100755 --- 
a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java @@ -1,7 +1,7 @@ package org.rdfhdt.hdt.dictionary.impl; +import org.rdfhdt.hdt.compact.integer.VByte; import org.rdfhdt.hdt.dictionary.DictionarySection; -import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; import org.rdfhdt.hdt.dictionary.TempDictionary; import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory; import org.rdfhdt.hdt.dictionary.impl.section.HashDictionarySection; @@ -18,12 +18,17 @@ import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.CompactString; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.util.*; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; public class MultipleSectionDictionaryBig extends MultipleBaseDictionary { @@ -34,7 +39,7 @@ public MultipleSectionDictionaryBig(HDTOptions spec) { // FIXME: Read type from spec. subjects = new PFCDictionarySectionBig(spec); predicates = new PFCDictionarySectionBig(spec); - objects = new TreeMap<>(); + objects = new TreeMap<>(CharSequenceComparator.getInstance()); shared = new PFCDictionarySectionBig(spec); } @@ -48,14 +53,13 @@ public void load(TempDictionary other, ProgressListener listener) { predicates.load(other.getPredicates(), iListener); Iterator iter = other.getObjects().getEntries(); - HashMap literalsCounts = ((HashDictionarySection)other.getObjects()).getLiteralsCounts(); - if(literalsCounts.containsKey("NO_DATATYPE")) - literalsCounts.put("NO_DATATYPE",literalsCounts.get("NO_DATATYPE") - other.getShared().getNumberOfElements()); + Map literalsCounts = ((HashDictionarySection)other.getObjects()).getLiteralsCounts(); + literalsCounts.computeIfPresent(LiteralsUtils.NO_DATATYPE, (key, value) -> (value - other.getShared().getNumberOfElements())); CustomIterator customIterator = new CustomIterator(iter,literalsCounts); while (customIterator.hasNext()){ PFCDictionarySectionBig section = new PFCDictionarySectionBig(spec); - String type = LiteralsUtils.getType(customIterator.prev); + String type = LiteralsUtils.getType(customIterator.prev).toString(); long numEntries = literalsCounts.get(type); section.load(customIterator,numEntries,listener); @@ -88,45 +92,36 @@ public void save(OutputStream output, ControlInfo ci, ProgressListener listener) ------------------ */ private void writeLiteralsMap(OutputStream output,ProgressListener listener) throws IOException { - Iterator hmIterator = objects.entrySet().iterator(); int numberOfTypes = objects.size(); - output.write(numberOfTypes); + VByte.encode(output, numberOfTypes); - ArrayList types = new ArrayList<>(); + ArrayList types = new ArrayList<>(); - while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - String uri = (String)entry.getKey(); - output.write(uri.length()); - IOUtil.writeBuffer(output, uri.getBytes(), 0, uri.getBytes().length, listener); - types.add(uri); + for (CharSequence uriKey : objects.keySet()) { + IOUtil.writeSizedBuffer(output, uriKey.toString().getBytes(), listener); + types.add(uriKey); } - for(String type:types){ + for(CharSequence type:types){ this.objects.get(type).save(output,listener); } } private 
void readLiteralsMap(InputStream input,ProgressListener listener) throws IOException { - int numberOfTypes = input.read(); - ArrayList types = new ArrayList<>(); + int numberOfTypes = (int) VByte.decode(input); + ArrayList types = new ArrayList<>(); for (int i = 0; i < numberOfTypes; i++) { - int length = input.read(); - byte[] type = IOUtil.readBuffer(input, length, listener); - types.add(new String(type)); + types.add(new CompactString(IOUtil.readSizedBuffer(input, listener))); } - for(String type:types){ + for(CharSequence type : types){ this.objects.put(type,DictionarySectionFactory.loadFrom(input,listener)); } } private void mapLiteralsMap(CountInputStream input,File f,ProgressListener listener) throws IOException { - int numberOfTypes = input.read(); - ArrayList types = new ArrayList<>(); + int numberOfTypes = (int) VByte.decode(input); + ArrayList types = new ArrayList<>(); for (int i = 0; i < numberOfTypes; i++) { - int length = input.read(); - byte[] type = IOUtil.readBuffer(input, length, listener); - String typeStr = new String(type); - types.add(typeStr); + types.add(new CompactString(IOUtil.readSizedBuffer(input, listener))); } - for(String type:types){ + for(CharSequence type : types){ this.objects.put(type,DictionarySectionFactory.loadFrom(input,f,listener)); } @@ -173,17 +168,11 @@ public void mapFromFile(CountInputStream in, File f, ProgressListener listener) @Override public long getNAllObjects() { - Iterator hmIterator = objects.entrySet().iterator(); - long count = 0; - while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - count += ((DictionarySectionPrivate)entry.getValue()).getNumberOfElements(); - } - return count; + return objects.values().stream().mapToLong(DictionarySection::getNumberOfElements).sum(); } @Override - public TreeMap getAllObjects() { + public Map getAllObjects() { return new TreeMap<>(objects); } @@ -213,16 +202,18 @@ public String getType() { @Override public void close() throws IOException { - shared.close(); - subjects.close(); - predicates.close(); - - // close all subsections - Iterator hmIterator = objects.entrySet().iterator(); - while (hmIterator.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator.next(); - ((DictionarySectionPrivate)entry.getValue()).close(); + try { + shared.close(); + } finally { + try { + subjects.close(); + } finally { + try { + predicates.close(); + } finally { + IOUtil.closeAll(objects.values()); + } + } } - } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java index 5fd5c9f5..8ec21b20 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java @@ -25,11 +25,17 @@ import org.rdfhdt.hdt.dictionary.Dictionary; import org.rdfhdt.hdt.dictionary.DictionaryCat; import org.rdfhdt.hdt.dictionary.DictionarySection; -import org.rdfhdt.hdt.dictionary.impl.utilCat.*; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatElement; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatIntersection; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMapping; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMappingBack; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatUnion; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatWrapper; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.listener.ProgressListener; 
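The rewritten close() above also fixes a quiet resource leak: previously, if shared.close() threw, the remaining sections were never closed. Nesting try/finally guarantees every close runs, with the caveat that an exception thrown later in the chain supersedes an earlier one. The bare pattern, with hypothetical closeables a, b and others:

    try {
        a.close();
    } finally {
        try {
            b.close();               // runs even if a.close() threw
        } finally {
            IOUtil.closeAll(others); // closes every remaining section
        }
    }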
import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.ControlInformation; +import org.rdfhdt.hdt.util.LiteralsUtils; import org.rdfhdt.hdt.util.crc.CRC32; import org.rdfhdt.hdt.util.crc.CRC8; import org.rdfhdt.hdt.util.crc.CRCOutputStream; @@ -37,12 +43,17 @@ import org.rdfhdt.hdt.util.listener.ListenerUtil; import org.rdfhdt.hdt.util.listener.PrefixListener; import org.rdfhdt.hdt.util.string.ByteStringUtil; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.CompactString; -import java.io.*; +import java.io.ByteArrayOutputStream; +import java.io.FileOutputStream; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -51,7 +62,7 @@ public class MultipleSectionDictionaryCat implements DictionaryCat { private static final int DEFAULT_BLOCK_SIZE = 16; private static final int BLOCK_PER_BUFFER = 1000000; - private static final String NO_DT_OBJECTS = "NO_DATATYPE"; + private static final CharSequence NO_DT_OBJECTS = LiteralsUtils.NO_DATATYPE; private final String location; private long numShared; @@ -63,8 +74,7 @@ public MultipleSectionDictionaryCat(String location) { } public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener listener) throws IOException { - - + Comparator comparator = CharSequenceComparator.getInstance(); // Initialize all mappings ...... allMappings.put("P1",new CatMapping(location,"P1",dictionary1.getPredicates().getNumberOfElements())); @@ -75,25 +85,27 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener allMappings.put("O2",new CatMapping(location, "O2",dictionary2.getNAllObjects())); allMappings.put("SH1",new CatMapping(location,"SH1",dictionary1.getShared().getNumberOfElements())); allMappings.put("SH2",new CatMapping(location,"SH2",dictionary2.getShared().getNumberOfElements())); - Iterator> hmIterator1 = dictionary1.getAllObjects().entrySet().iterator(); + Iterator> hmIterator1 = dictionary1.getAllObjects().entrySet().iterator(); int countSubSections1 = 0; int countSubSections2 = 0; while (hmIterator1.hasNext()){ - Map.Entry entry = hmIterator1.next(); + Map.Entry entry = hmIterator1.next(); String prefix = "sub"+countSubSections1; - if((entry.getKey()).equals(NO_DT_OBJECTS)) - prefix = entry.getKey(); + if((entry.getKey()).equals(NO_DT_OBJECTS)) { + prefix = entry.getKey().toString(); + } allMappings.put(prefix+"1",new CatMapping(location,prefix+"1", entry.getValue().getNumberOfElements())); countSubSections1++; } - Iterator> hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); + Iterator> hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); while (hmIterator2.hasNext()){ - Map.Entry entry = hmIterator2.next(); + Map.Entry entry = hmIterator2.next(); String prefix = "sub"+countSubSections2; - if((entry.getKey()).equals(NO_DT_OBJECTS)) - prefix = entry.getKey(); + if((entry.getKey()).equals(NO_DT_OBJECTS)) { + prefix = entry.getKey().toString(); + } allMappings.put(prefix+"2",new CatMapping(location,prefix+"2", entry.getValue().getNumberOfElements())); countSubSections2++; } @@ -218,22 +230,22 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); boolean skip1 = false; boolean skip2 = false; - String dataType1 = ""; - String dataType2 = ""; + 
CharSequence dataType1 = CompactString.EMPTY; + CharSequence dataType2 = CompactString.EMPTY; DictionarySection section1 = null; DictionarySection section2 = null; while (hmIterator1.hasNext() || hmIterator2.hasNext()){ if(hmIterator1.hasNext()){ if(!skip1) { - Map.Entry entry1 = hmIterator1.next(); + Map.Entry entry1 = hmIterator1.next(); section1 = entry1.getValue(); dataType1 = entry1.getKey(); } } if(hmIterator2.hasNext()){ if(!skip2){ - Map.Entry entry2 = hmIterator2.next(); + Map.Entry entry2 = hmIterator2.next(); section2 = entry2.getValue(); dataType2 = entry2.getKey(); } @@ -244,10 +256,11 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener new CatWrapper(section2.getSortedEntries(), dataType2 + "_2") )); }else{ - if(dataType1.compareTo(dataType2) > 0){ + int comp = comparator.compare(dataType1, dataType2); + if(comp > 0){ skip1 = true; skip2 = false; - }else if(dataType1.compareTo(dataType2) < 0){ + } else if(comp < 0){ skip1 = false; skip2 = true; } @@ -293,17 +306,17 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener hmIterator1 = dictionary1.getAllObjects().entrySet().iterator(); hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); int type = 4; - ArrayList dataTypes = new ArrayList<>(); + ArrayList dataTypes = new ArrayList<>(); // iterate over objects subsections and cat them together countSubSections1 = 0; countSubSections2 = 0; - HashMap offsets = new HashMap<>(); + HashMap offsets = new HashMap<>(); long total = 0; skip1 = false; skip2 = false; - dataType1 = ""; - dataType2 = ""; + dataType1 = CompactString.EMPTY; + dataType2 = CompactString.EMPTY; section1 = null; section2 = null; String prefix1 = ""; @@ -314,27 +327,29 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener ArrayList> countObjectsList = new ArrayList<>(); if(hmIterator1.hasNext()){ if(!skip1) { - Map.Entry entry = hmIterator1.next(); + Map.Entry entry = hmIterator1.next(); dataType1 = entry.getKey(); section1 = entry.getValue(); prefix1 = "sub" + countSubSections1; - if (dataType1.equals(NO_DT_OBJECTS)) - prefix1 = dataType1; + if (dataType1.equals(NO_DT_OBJECTS)) { + prefix1 = dataType1.toString(); + } countSubSections1++; } } if(hmIterator2.hasNext()){ if(!skip2) { - Map.Entry entry = hmIterator2.next(); + Map.Entry entry = hmIterator2.next(); dataType2 = entry.getKey(); section2 = entry.getValue(); prefix2 = "sub" + countSubSections2; - if (dataType2.equals(NO_DT_OBJECTS)) - prefix2 = dataType2; + if (dataType2.equals(NO_DT_OBJECTS)) { + prefix2 = dataType2.toString(); + } countSubSections2++; } } - String dataType = ""; + CharSequence dataType = CompactString.EMPTY; if(section1 != null && section2 != null && dataType1.equals(dataType2)){ dataType = dataType1; addObjectsList.add(new CatWrapper( @@ -357,23 +372,26 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener skip2 = false; if(!hmIterator1.hasNext()){ section1 = null; - dataType1 = ""; + dataType1 = CompactString.EMPTY; }else if(!hmIterator2.hasNext()){ section2 = null; - dataType2 = ""; + dataType2 = CompactString.EMPTY; } }else{ boolean fromOne = false; boolean fromTwo = false; - if(dataType1.equals("")){ + if(dataType1.length() == 0){ fromTwo = true; - }else if(dataType2.equals("")){ + }else if(dataType2.length() == 0){ fromOne = true; } - if(dataType1.compareTo(dataType2) < 0) + int comp = comparator.compare(dataType1, dataType2); + if(comp < 0) { fromOne = true; - if(dataType1.compareTo(dataType2) > 0) + } 
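The loops above implement a two-pointer merge: both dictionaries expose their object subsections sorted by datatype under the same comparator, equal keys are catted together, and the skip1/skip2 flags hold back the side whose key sorted higher. The same traversal in compact form (map1, map2, mergeSections and copySection are hypothetical stand-ins for the CatUnion/CatWrapper plumbing):

    Comparator<CharSequence> cmp = CharSequenceComparator.getInstance();
    Iterator<Map.Entry<CharSequence, DictionarySection>> it1 = map1.entrySet().iterator();
    Iterator<Map.Entry<CharSequence, DictionarySection>> it2 = map2.entrySet().iterator();
    Map.Entry<CharSequence, DictionarySection> e1 = it1.hasNext() ? it1.next() : null;
    Map.Entry<CharSequence, DictionarySection> e2 = it2.hasNext() ? it2.next() : null;

    while (e1 != null || e2 != null) {
        int c = e1 == null ? 1 : e2 == null ? -1 : cmp.compare(e1.getKey(), e2.getKey());
        if (c == 0) { // same datatype on both sides: merge the two sections
            mergeSections(e1.getKey(), e1.getValue(), e2.getValue());
            e1 = it1.hasNext() ? it1.next() : null;
            e2 = it2.hasNext() ? it2.next() : null;
        } else if (c < 0) { // datatype only in dictionary 1
            copySection(e1.getKey(), e1.getValue());
            e1 = it1.hasNext() ? it1.next() : null;
        } else { // datatype only in dictionary 2
            copySection(e2.getKey(), e2.getValue());
            e2 = it2.hasNext() ? it2.next() : null;
        }
    }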
+ if(comp > 0) { fromTwo = true; + } if(section1!= null && fromOne){ // section 1 before section 2 dataType = dataType1; addObjectsList.add(new CatWrapper( @@ -404,7 +422,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener ); if(!hmIterator2.hasNext()){ section2 = null; - dataType2 = ""; + dataType2 = CompactString.EMPTY; skip1 = false; }else { skip1 = true; @@ -492,17 +510,20 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener try (FileOutputStream outFinal = new FileOutputStream(location + "dictionary")) { ci.save(outFinal); - for (int i = 1; i <= 3 + dataTypes.size(); i++) { - if(i == 4){ // write literals map before writing the objects sections - outFinal.write(dataTypes.size()); - for(String datatype:dataTypes){ - outFinal.write(datatype.length()); - IOUtil.writeBuffer(outFinal, datatype.getBytes(), 0, datatype.getBytes().length, iListener); - } - } + for (int i = 1; i <= 3; i++) { Files.copy(Path.of(location + "section" + i), outFinal); Files.delete(Path.of(location + "section" + i)); } + VByte.encode(outFinal, dataTypes.size()); + for(CharSequence datatype:dataTypes){ + String datatypeStr = datatype.toString(); + byte[] bytes = datatypeStr.getBytes(); + IOUtil.writeSizedBuffer(outFinal, bytes, 0, bytes.length, iListener); + } + for (int i = 0; i < dataTypes.size(); i++) { + Files.copy(Path.of(location + "section" + (4 + i)), outFinal); + Files.delete(Path.of(location + "section" + (4 + i))); + } } // create the objects mappings long oldId = 0; @@ -510,8 +531,8 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener countSubSections1 = 0; countSubSections2 = 0; while (hmIterator1.hasNext()){ - Map.Entry entry = hmIterator1.next(); - String dataType = entry.getKey(); + Map.Entry entry = hmIterator1.next(); + CharSequence dataType = entry.getKey(); String prefix = "sub"+countSubSections1; if(dataType.equals(NO_DT_OBJECTS)) prefix = dataType+"1"; @@ -534,8 +555,8 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener oldId = 0; hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); while (hmIterator2.hasNext()){ - Map.Entry entry = hmIterator2.next(); - String dataType = entry.getKey(); + Map.Entry entry = hmIterator2.next(); + CharSequence dataType = entry.getKey(); String prefix = "sub"+countSubSections2; if(dataType.equals(NO_DT_OBJECTS)) prefix = dataType+"2"; @@ -673,7 +694,8 @@ private void catSection(long numEntries, int type, CatUnion itAdd , CatUnion itS blocks.aggressiveTrimToSize(); byteOut.flush(); //section.addBuffer(buffer, byteOut.toByteArray()); - IOUtil.writeBuffer(outBuffer, byteOut.toByteArray(), 0, byteOut.toByteArray().length, null); + byte[] bytes = byteOut.toByteArray(); + IOUtil.writeBuffer(outBuffer, bytes, 0, bytes.length, null); outBuffer.writeCRC(); } //Save the section conforming to the HDT format diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java index b56cf3d2..e6c23d59 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java @@ -2,6 +2,7 @@ import org.rdfhdt.hdt.compact.bitmap.Bitmap; import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; +import org.rdfhdt.hdt.compact.integer.VByte; import org.rdfhdt.hdt.dictionary.Dictionary; 
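For reference, the file framing written above (and mirrored in MultipleSectionDictionaryDiff below) has to line up byte for byte with readLiteralsMap/mapLiteralsMap. After the ControlInfo header, the merged dictionary is laid out as:

    // section1..section3      shared, subject and predicate sections, in the
    //                         order expected by the reader
    // VByte                   number of datatypes
    // per datatype            VByte byte length, then the bytes of the
    //                         datatype URI (IOUtil.writeSizedBuffer)
    // section4..section(3+n)  one object subsection per datatype, in the
    //                         same order as the type table above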
import org.rdfhdt.hdt.dictionary.DictionaryDiff; import org.rdfhdt.hdt.dictionary.DictionarySection; @@ -14,24 +15,24 @@ import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.ControlInformation; +import org.rdfhdt.hdt.util.LiteralsUtils; import org.rdfhdt.hdt.util.io.IOUtil; -import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; -import java.nio.file.Paths; +import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; +import java.util.List; import java.util.Map; public class MultipleSectionDictionaryDiff implements DictionaryDiff { private final String location; - private final HashMap allMappings = new HashMap<>(); + private final Map allMappings = new HashMap<>(); private CatMapping mappingBack; public long numShared; public MultipleSectionDictionaryDiff(String location){ @@ -48,15 +49,15 @@ public void close() throws IOException { } } @Override - public void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) throws IOException { + public void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) throws IOException { allMappings.put("predicate",new CatMapping(location,"predicate",dictionary.getPredicates().getNumberOfElements())); allMappings.put("subject",new CatMapping(location,"subject",dictionary.getSubjects().getNumberOfElements())); int countSubSection = 0; - for (Map.Entry next : dictionary.getAllObjects().entrySet()) { + for (Map.Entry next : dictionary.getAllObjects().entrySet()) { String subPrefix = "sub"+countSubSection; - if(next.getKey().equals("NO_DATATYPE")){ - subPrefix = "NO_DATATYPE"; + if(next.getKey().equals(LiteralsUtils.NO_DATATYPE)){ + subPrefix = LiteralsUtils.NO_DATATYPE.toString(); } allMappings.put(subPrefix,new CatMapping(location,subPrefix,next.getValue().getNumberOfElements())); countSubSection++; @@ -108,18 +109,18 @@ public void diff(Dictionary dictionary, Map bitmaps, P // Objects ----------------------------+++++++++++++++++++++++++++++++++---------------------------------------- - ArrayList dataTypes = new ArrayList<>(); - HashMap offsets = new HashMap<>(); + List dataTypes = new ArrayList<>(); + Map offsets = new HashMap<>(); int countSection = 0; long totalObjects = 0; - for (Map.Entry next : dictionary.getAllObjects().entrySet()) { + for (Map.Entry next : dictionary.getAllObjects().entrySet()) { int type = 4 + dataTypes.size(); - if(next.getKey().equals("NO_DATATYPE")){ + if(next.getKey().equals(LiteralsUtils.NO_DATATYPE)){ long numNoDataType = createNoDataTypeSection(bitmaps, dictionary,totalObjects,type); if(numNoDataType > 0){ - dataTypes.add("NO_DATATYPE"); - offsets.put("NO_DATATYPE",totalObjects); + dataTypes.add(LiteralsUtils.NO_DATATYPE); + offsets.put(LiteralsUtils.NO_DATATYPE,totalObjects); totalObjects+= numNoDataType; } }else { @@ -184,32 +185,29 @@ public void diff(Dictionary dictionary, Map bitmaps, P try (OutputStream out = new FileOutputStream(location + "dictionary")) { ci.save(out); - byte[] buf = new byte[100000]; - for (int i = 1; i <= 3 +dataTypes.size(); i++) { - if(i == 4) { // write literals map before writing the objects sections - out.write(dataTypes.size()); - for (String datatype : dataTypes){ - out.write(datatype.length()); - IOUtil.writeBuffer(out, datatype.getBytes(), 0, datatype.getBytes().length, listener); - } - } - try (InputStream in = new 
FileInputStream(location + "section" + i)) { - int b; - while ((b = in.read(buf)) >= 0) { - out.write(buf, 0, b); - } - } - Files.delete(Paths.get(location + "section" + i)); + for (int i = 1; i <= 3; i++) { + Files.copy(Path.of(location + "section" + i), out); + Files.delete(Path.of(location + "section" + i)); + } + VByte.encode(out, dataTypes.size()); + for(CharSequence datatype:dataTypes){ + String datatypeStr = datatype.toString(); + byte[] bytes = datatypeStr.getBytes(); + IOUtil.writeSizedBuffer(out, bytes, 0, bytes.length, listener); + } + for (int i = 0; i < dataTypes.size(); i++) { + Files.copy(Path.of(location + "section" + (4 + i)), out); + Files.delete(Path.of(location + "section" + (4 + i))); } } // create global objects mapping from section by section mappings long oldId = 0; countSection = 0; - for (Map.Entry next : dictionary.getAllObjects().entrySet()) { - String dataType = next.getKey(); + for (CharSequence dataType : dictionary.getAllObjects().keySet()) { String subPrefix = "sub"+countSection; - if(dataType.equals("NO_DATATYPE")) - subPrefix = dataType; + if(dataType.equals(LiteralsUtils.NO_DATATYPE)) { + subPrefix = dataType.toString(); + } if(allMappings.containsKey(subPrefix)){ CatMapping mapping = allMappings.get(subPrefix); @@ -247,11 +245,11 @@ public void diff(Dictionary dictionary, Map bitmaps, P } } - private long createNoDataTypeSection(Map bitmaps,Dictionary dictionary,long numObjectsAlreadyAdded,int type) throws IOException { - Bitmap objectsBitMap = bitmaps.get("NO_DATATYPE"); - Iterator objects = dictionary.getAllObjects().get("NO_DATATYPE").getSortedEntries(); + private long createNoDataTypeSection(Map bitmaps,Dictionary dictionary,long numObjectsAlreadyAdded,int type) throws IOException { + Bitmap objectsBitMap = bitmaps.get(LiteralsUtils.NO_DATATYPE); + Iterator objects = dictionary.getAllObjects().get(LiteralsUtils.NO_DATATYPE).getSortedEntries(); - DiffWrapper itSkipObjs = new DiffWrapper(objects, objectsBitMap,"NO_DATATYPE"); + DiffWrapper itSkipObjs = new DiffWrapper(objects, objectsBitMap,LiteralsUtils.NO_DATATYPE); ArrayList> listSkipObjs = new ArrayList<>(); listSkipObjs.add(itSkipObjs); @@ -314,7 +312,7 @@ public int count() { return i; } } - public HashMap getAllMappings() { + public Map getAllMappings() { return allMappings; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java new file mode 100644 index 00000000..293bb853 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java @@ -0,0 +1,200 @@ +package org.rdfhdt.hdt.dictionary.impl; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.TempDictionary; +import org.rdfhdt.hdt.dictionary.impl.section.WriteDictionarySection; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.io.CountInputStream; +import org.rdfhdt.hdt.util.io.IOUtil; +import 
org.rdfhdt.hdt.util.listener.IntermediateListener;
+import org.rdfhdt.hdt.util.listener.ListenerUtil;
+import org.rdfhdt.hdt.util.string.ByteStringUtil;
+import org.rdfhdt.hdt.util.string.CharSequenceComparator;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Version of multi-section dictionary with {@link org.rdfhdt.hdt.dictionary.impl.section.WriteDictionarySection}
+ * @author Antoine Willerval
+ */
+public class WriteMultipleSectionDictionary extends MultipleBaseDictionary {
+	private final Path filename;
+	private final int bufferSize;
+	public WriteMultipleSectionDictionary(HDTOptions spec, Path filename, int bufferSize) {
+		super(spec);
+		this.filename = filename;
+		this.bufferSize = bufferSize;
+		String name = filename.getFileName().toString();
+		subjects = new WriteDictionarySection(spec, filename.resolveSibling(name + "SU"), bufferSize);
+		predicates = new WriteDictionarySection(spec, filename.resolveSibling(name + "PR"), bufferSize);
+		objects = new TreeMap<>(CharSequenceComparator.getInstance());
+		shared = new WriteDictionarySection(spec, filename.resolveSibling(name + "SH"), bufferSize);
+	}
+
+	@Override
+	public long getNAllObjects() {
+		return objects.values().stream().mapToLong(DictionarySectionPrivate::getNumberOfElements).sum();
+	}
+
+	private ExceptionThread fillSection(Iterator objects, ProgressListener listener) throws InterruptedException {
+		PipedCopyIterator noDatatypeIterator = new PipedCopyIterator<>();
+		PipedCopyIterator datatypeIterator = new PipedCopyIterator<>();
+		String name = filename.getFileName().toString();
+		WriteDictionarySection noDatatypeSection = new WriteDictionarySection(spec, filename.resolveSibling(name + LiteralsUtils.NO_DATATYPE), bufferSize);
+		this.objects.put(LiteralsUtils.NO_DATATYPE, noDatatypeSection);
+		return new ExceptionThread(() -> {
+			// object reader
+			try {
+				CharSequence oldType = null;
+				boolean noDatatype = false;
+				while (objects.hasNext()) {
+					CharSequence next = objects.next();
+
+					CharSequence type = LiteralsUtils.getType(next);
+
+					if (oldType != null) {
+						if (oldType.equals(type)) {
+							if (noDatatype) {
+								noDatatypeIterator.addElement(next);
+							} else {
+								datatypeIterator.addElement(next);
+							}
+							continue;
+						} else {
+							if (!noDatatype) {
+								datatypeIterator.closePipe();
+							}
+						}
+					}
+					oldType = type;
+
+					if (LiteralsUtils.isNoDatatype(type)) {
+						noDatatypeIterator.addElement(next);
+						noDatatype = true;
+					} else {
+						datatypeIterator.addElement(next);
+						noDatatype = false;
+					}
+				}
+				noDatatypeIterator.closePipe();
+				datatypeIterator.closePipe();
+			} catch (Throwable e) {
+				try {
+					throw e;
+				} finally {
+					try {
+						noDatatypeIterator.closePipe(e);
+					} finally {
+						datatypeIterator.closePipe(e);
+					}
+				}
+			}
+		}, "MultiSecSAsyncObjectReader").attach(new ExceptionThread(() -> {
+			// datatype writer
+			throw new NotImplementedException("MultiSecSAsyncObjectDatatypeWriter");
+		}, "MultiSecSAsyncObjectDatatypeWriter")).attach(new ExceptionThread(() -> {
+			// no datatype writer
+//			noDatatypeSection.load(new OneReadDictionarySection(noDatatypeIterator), );
+			throw new NotImplementedException("MultiSecSAsyncObjectNoDatatypeWriter");
+		}, "MultiSecSAsyncObjectNoDatatypeWriter"));
+	}
+
+	@Override
+	public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException {
+		MultiThreadListener ml = ListenerUtil.multiThreadListener(listener);
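+		// the shared, subject and predicate sections are loaded in parallel
+		// threads while fillSection() splits the object stream by datatype;
+		// joinAndCrashIfRequired() rethrows the first worker failure
+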
ml.unregisterAllThreads(); + ExceptionThread.async("MultiSecSAsyncReader", + () -> predicates.load(other.getPredicates(), new IntermediateListener(ml, "Predicate: ")), + () -> subjects.load(other.getSubjects(), new IntermediateListener(ml, "Subjects: ")), + () -> shared.load(other.getShared(), new IntermediateListener(ml, "Shared: ")) + ).attach(fillSection(other.getObjects().getEntries(), new IntermediateListener(ml, "Objects: "))) + .startAll() + .joinAndCrashIfRequired(); + ml.unregisterAllThreads(); + } + @Override + public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { + ci.setType(ControlInfo.Type.DICTIONARY); + ci.setFormat(getType()); + ci.setInt("elements", this.getNumberOfElements()); + ci.save(output); + + IntermediateListener iListener = new IntermediateListener(listener); + iListener.setRange(0, 25); + iListener.setPrefix("Save shared: "); + shared.save(output, iListener); + iListener.setRange(25, 50); + iListener.setPrefix("Save subjects: "); + subjects.save(output, iListener); + iListener.setRange(50, 75); + iListener.setPrefix("Save predicates: "); + predicates.save(output, iListener); + iListener.setRange(75, 100); + iListener.setPrefix("Save objects: "); + + VByte.encode(output, objects.size()); + + for (Map.Entry entry : objects.entrySet()) { + IOUtil.writeSizedBuffer(output, entry.getKey().toString().getBytes(ByteStringUtil.STRING_ENCODING), listener); + } + + for (Map.Entry entry : objects.entrySet()) { + entry.getValue().save(output, iListener); + } + + } + + @Override + public void close() throws IOException { + try { + IOUtil.closeAll(shared, subjects, predicates); + } finally { + IOUtil.closeAll(objects.values()); + } + } + + @Override + public void populateHeader(Header header, String rootNode) { + header.insert(rootNode, HDTVocabulary.DICTIONARY_TYPE, getType()); + header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSHARED, getNshared()); + header.insert(rootNode, HDTVocabulary.DICTIONARY_SIZE_STRINGS, size()); + } + + @Override + public String getType() { + return HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION; + } + + @Override + public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void load(TempDictionary other, ProgressListener listener) { + throw new NotImplementedException(); + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java index e4eb79b5..faaa4491 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java @@ -30,14 +30,17 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.Map; import org.rdfhdt.hdt.dictionary.TempDictionarySection; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import 
org.rdfhdt.hdt.util.string.CharSequenceCustomComparator; import org.rdfhdt.hdt.util.string.CompactString; @@ -53,24 +56,19 @@ public class HashDictionarySection implements TempDictionarySection { private List list; private int size; public boolean sorted; - boolean isCustom; - private HashMap literalsCounts; + final boolean isCustom; + private final Map literalsCounts = new HashMap<>(); /** * */ public HashDictionarySection(boolean isCustom) { - this(new HDTSpecification()); this.isCustom = isCustom; - } - public HashDictionarySection() { - this(new HDTSpecification()); - this.isCustom = isCustom; - } - public HashDictionarySection(HDTOptions spec) { map = new HashMap<>(); list = new ArrayList<>(); size=0; - literalsCounts = new HashMap<>(); + } + public HashDictionarySection() { + this(false); } /* (non-Javadoc) @@ -78,8 +76,7 @@ public HashDictionarySection(HDTOptions spec) { */ @Override public long locate(CharSequence s) { - CompactString compact = new CompactString(s); - Long val = map.get(compact); + Long val = map.get(ByteStringUtil.asByteString(s)); if(val==null) { return 0; } @@ -132,31 +129,20 @@ public Iterator getEntries() { @Override public long add(CharSequence entry) { CharSequence compact = new CompactString(entry); - Long pos = map.get(compact); - if(pos!=null) { - // Found return existing ID. - return pos; - } - - // Not found, insert new - list.add(compact); - map.put(compact, (long) list.size()); - - size+=compact.length(); - sorted = false; - - // custom for subsection literals .. - if(isCustom){ - String type = LiteralsUtils.getType(entry); - // check if the entry doesn't already exists - if(map.get(entry) == null) { - if (literalsCounts.containsKey(type)) { - literalsCounts.put(type, literalsCounts.get(type) + 1L); - } else - literalsCounts.put(type, 1L); + return map.computeIfAbsent(compact, key -> { + // Not found, insert new + list.add(compact); + size+=compact.length(); + sorted = false; + + // custom for subsection literals .. + if (isCustom) { + CharSequence type = LiteralsUtils.getType(compact); + // check if the entry doesn't already exist + literalsCounts.compute(type, (key2, count) -> count == null ? 1L : count + 1L); } - } - return list.size(); + return (long) list.size(); + }); } @Override @@ -169,18 +155,17 @@ public void remove(CharSequence seq) { public void sort() { // Update list. 
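		// rebuild the id list from the map keys, sort it with the section
		// comparator, then remap every entry to its new 1-based rank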
		list = new ArrayList<>(map.size());
-		for(CharSequence str : map.keySet()) {
-			list.add(str);
-		}
+		list.addAll(map.keySet());

		// Sort list
-		if(isCustom)
-			Collections.sort(list, new CharSequenceCustomComparator());
-		else
-			Collections.sort(list, new CharSequenceComparator());
+		if (isCustom) {
+			list.sort(new CharSequenceCustomComparator());
+		} else {
+			list.sort(new CharSequenceComparator());
+		}

		// Update map indexes
-		for(long i=1;i<=getNumberOfElements();i++) {
+		for (long i = 1; i <= getNumberOfElements(); i++) {
			map.put(extract(i), i);
		}
@@ -196,17 +181,18 @@ public boolean isSorted() {
	public void clear() {
		list.clear();
		map.clear();
-		size=0;
+		size = 0;
		sorted = false; //because if it was sorted, it won't be anymore
	}

	@Override
	public void close() throws IOException {
-		map=null;
-		list=null;
+		map = null;
+		list = null;
	}

-	public HashMap getLiteralsCounts() {
+	@Override
+	public Map getLiteralsCounts() {
		return literalsCounts;
	}
}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java
index 1af57850..f28e97f0 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java
@@ -5,16 +5,29 @@
 import java.io.IOException;
 import java.util.Iterator;
-
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicReference;
+
+/**
+ * Dictionary section assuming {@link #getSortedEntries()} or {@link #getEntries()} will only be called once;
+ * it will throw an {@link java.lang.IllegalArgumentException} otherwise.
+ *
+ * @author Antoine Willerval
+ */
 public class OneReadDictionarySection implements TempDictionarySection {
-	private final Iterator reader;
	private final long size;
+	private final AtomicReference> ref = new AtomicReference<>();

	public OneReadDictionarySection(Iterator reader, long size) {
-		this.reader = reader;
+		ref.set(reader);
		this.size = size;
	}

+	@Override
+	public Map getLiteralsCounts() {
+		return TempDictionarySection.super.getLiteralsCounts();
+	}
+
	@Override
	public long add(CharSequence str) {
		throw new NotImplementedException();
	}
@@ -42,7 +55,13 @@ public boolean isSorted() {
	@Override
	public Iterator getEntries() {
-		return reader;
+		Iterator it = ref.getAndSet(null);
+
+		if (it == null) {
+			throw new IllegalArgumentException("This dictionary has already been read");
+		}
+
+		return it;
	}

	@Override
@@ -67,10 +86,11 @@ public long getNumberOfElements() {
	@Override
	public Iterator getSortedEntries() {
-		return reader;
+		return getEntries();
	}

	@Override
	public void close() throws IOException {
+		ref.getAndSet(null);
	}
}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java
index 721d9225..ef4e88fa 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java
@@ -84,7 +84,7 @@ public PFCDictionarySection(HDTOptions spec) {
	public void load(TempDictionarySection other, ProgressListener listener) {
		this.blocks = new SequenceLog64(BitUtil.log2(other.size()), other.getNumberOfElements()/blocksize);
		Iterator it = other.getSortedEntries();
-		this.load((Iterator)it,
other.getNumberOfElements(), listener);
+		this.load(it, other.getNumberOfElements(), listener);
	}

	public void load(PFCDictionarySectionBuilder builder) throws IOException {
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBuilder.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBuilder.java
index f83dfc39..57d93728 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBuilder.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBuilder.java
@@ -11,9 +11,9 @@ public class PFCDictionarySectionBuilder {
	// FIXME: Due to java array indexes being int, only 2GB can be addressed per dictionary section.
	private byte [] text=new byte[0]; // Encoded sequence
-	private int blocksize;
+	private final int blocksize;
	private int numstrings;
-	private SequenceLog64 blocks;
+	private final SequenceLog64 blocks;

	ByteArrayOutputStream byteOut = new ByteArrayOutputStream(16*1024);
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java
index f8dcdd72..f172d6fb 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java
@@ -15,7 +15,10 @@
 import org.rdfhdt.hdt.util.io.CountOutputStream;
 import org.rdfhdt.hdt.util.io.IOUtil;
 import org.rdfhdt.hdt.util.listener.ListenerUtil;
+import org.rdfhdt.hdt.util.string.ByteString;
 import org.rdfhdt.hdt.util.string.ByteStringUtil;
+import org.rdfhdt.hdt.util.string.CompactString;
+import org.rdfhdt.hdt.util.string.ReplazableString;

 import java.io.IOException;
 import java.io.InputStream;
@@ -26,7 +29,8 @@
 /**
  * Implementation of {@link org.rdfhdt.hdt.dictionary.DictionarySectionPrivate} that writes the loaded
- * {@link org.rdfhdt.hdt.dictionary.TempDictionarySection} on disk before saving, reducing the size in ram
+ * {@link org.rdfhdt.hdt.dictionary.TempDictionarySection} to disk before saving, reducing the size in RAM;
+ * the iterator must return byte strings
  *
  * @author Antoine Willerval
  */
@@ -44,13 +48,9 @@ public WriteDictionarySection(HDTOptions spec, Path filename, int bufferSize) {
		String fn = filename.getFileName().toString();
		tempFilename = CloseSuppressPath.of(filename.resolveSibling(fn + "_temp"));
		blockTempFilename = CloseSuppressPath.of(filename.resolveSibling(fn + "_tempblock"));
-		long blockSize = spec.getInt("pfc.blocksize");
+		blockSize = spec.getInt("pfc.blocksize", PFCDictionarySection.DEFAULT_BLOCK_SIZE);
		if (blockSize < 0) {
			throw new IllegalArgumentException("negative pfc.blocksize");
-		} else if (blockSize == 0) {
-			this.blockSize = PFCDictionarySection.DEFAULT_BLOCK_SIZE;
-		} else {
-			this.blockSize = blockSize;
		}
	}
@@ -65,10 +65,10 @@ public void load(TempDictionarySection other, ProgressListener plistener) {
		listener.notifyProgress(0, "Filling section");
		try (CountOutputStream out = new CountOutputStream(tempFilename.openOutputStream(bufferSize))) {
			CRCOutputStream crcout = new CRCOutputStream(out, new CRC32());
-			String previousStr = null;
+			ByteString previousStr = null;
			for (Iterator it = other.getSortedEntries(); it.hasNext(); currentCount++) {
-				CharSequence sec = it.next();
-				String str = sec.toString();
+				ByteString str = (ByteString) (it.next());
+				assert str != null;
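+				// every blockSize-th entry starts a new front-coded block: record
+				// its byte offset so it can be located without decoding from the start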
if (numberElements % blockSize == 0) { blocks.append(out.getTotalBytes()); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatMappingBack.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatMappingBack.java index 3da1f119..67f4d177 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatMappingBack.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatMappingBack.java @@ -26,27 +26,26 @@ import org.apache.jena.ext.com.google.common.io.Closeables; import org.rdfhdt.hdt.util.disk.LongArrayDisk; +import org.rdfhdt.hdt.util.io.IOUtil; /** * @author Dennis Diefenbach & Jose Gimenez Garcia */ public class CatMappingBack implements Closeable { - private String location; - private long size; - private LongArrayDisk mapping1; - private LongArrayDisk mappingType1; - private LongArrayDisk mapping2; - private LongArrayDisk mappingType2; + private final long size; + private final LongArrayDisk mapping1; + private final LongArrayDisk mappingType1; + private final LongArrayDisk mapping2; + private final LongArrayDisk mappingType2; public CatMappingBack(String location, long size){ - this.location = location; this.size = size+1; this.mapping1 = new LongArrayDisk(location+"mapping_back_1",this.size); - this.mapping2 = new LongArrayDisk(location+"mapping_back_2",this.size);; - this.mappingType1 = new LongArrayDisk(location+"mapping_back_type_1",this.size);; - this.mappingType2 = new LongArrayDisk(location+"mapping_back_type_2",this.size);; - } + this.mapping2 = new LongArrayDisk(location+"mapping_back_2",this.size); + this.mappingType1 = new LongArrayDisk(location+"mapping_back_type_1",this.size); + this.mappingType2 = new LongArrayDisk(location+"mapping_back_type_2",this.size); + } public long size(){ return size; @@ -85,17 +84,11 @@ public void set(long i, int mapping, int type){ } @Override public void close() throws IOException { - if (this.mapping1 != null) { - this.mapping1.close(); - } - if (this.mapping2 != null){ - this.mapping2.close(); - } - if (this.mappingType1 != null) { - this.mappingType1.close(); - } - if (this.mappingType2 != null) { - this.mappingType2.close(); - } + IOUtil.closeAll( + mapping1, + mapping2, + mappingType1, + mappingType2 + ); } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatWrapper.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatWrapper.java index cec5d61c..0405f0e0 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatWrapper.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatWrapper.java @@ -14,10 +14,7 @@ public CatWrapper(Iterator sectionIter,String iterName){ @Override public boolean hasNext() { - if(sectionIter.hasNext()) - return true; - else - return false; + return sectionIter.hasNext(); } @Override diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java index 74891bdc..e243cd89 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java @@ -10,19 +10,21 @@ import org.rdfhdt.hdt.util.listener.ListenerUtil; import org.rdfhdt.hdt.util.string.ByteStringUtil; -import java.io.*; +import java.io.ByteArrayOutputStream; +import java.io.FileOutputStream; 
+import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.HashMap; +import java.util.Map; public class SectionUtil { private static final int DEFAULT_BLOCK_SIZE = 16; private static final int BLOCK_PER_BUFFER = 1000000; - public static void createSection(String location,long numEntries, int type, CatUnion itAdd , - CatUnion itSkip , HashMap mappings,long offset, ProgressListener listener) throws IOException { + public static void createSection(String location, long numEntries, int type, CatUnion itAdd , + CatUnion itSkip , Map mappings, long offset, ProgressListener listener) throws IOException { String name = ""; switch (type) { case 2: diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilDiff/DiffWrapper.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilDiff/DiffWrapper.java index 4441e4c1..be58746e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilDiff/DiffWrapper.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilDiff/DiffWrapper.java @@ -13,7 +13,7 @@ public class DiffWrapper implements Iterator { public final Iterator sectionIter; public final Bitmap bitmap; - public final String iterName; + public final CharSequence iterName; /** * create a diffWrapper of the iterator sectionIter with the bitmap bitmap @@ -22,7 +22,7 @@ public class DiffWrapper implements Iterator { * @param bitmap the bitmap to tell which element to keep * @param iterName the name of the section of the iterator */ - public DiffWrapper(Iterator sectionIter, Bitmap bitmap, String iterName) { + public DiffWrapper(Iterator sectionIter, Bitmap bitmap, CharSequence iterName) { this.sectionIter = sectionIter; this.bitmap = bitmap; this.iterName = iterName; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index 24b037dd..355fcf67 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -1,29 +1,20 @@ package org.rdfhdt.hdt.hdt; import org.rdfhdt.hdt.compact.bitmap.Bitmap; -import org.rdfhdt.hdt.dictionary.DictionaryPrivate; -import org.rdfhdt.hdt.dictionary.impl.CompressFourSectionDictionary; import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionary; import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; -import org.rdfhdt.hdt.enums.TripleComponentOrder; import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.exceptions.ParserException; -import org.rdfhdt.hdt.hdt.impl.HDTBase; +import org.rdfhdt.hdt.hdt.impl.HDTDiskImporter; import org.rdfhdt.hdt.hdt.impl.HDTImpl; import org.rdfhdt.hdt.hdt.impl.TempHDTImporterOnePass; import org.rdfhdt.hdt.hdt.impl.TempHDTImporterTwoPass; -import org.rdfhdt.hdt.hdt.impl.WriteHDTImpl; -import org.rdfhdt.hdt.hdt.impl.diskimport.CompressTripleMapper; -import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; -import org.rdfhdt.hdt.hdt.impl.diskimport.SectionCompressor; -import org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult; import org.rdfhdt.hdt.hdt.writer.TripleWriterHDT; -import org.rdfhdt.hdt.header.HeaderPrivate; import org.rdfhdt.hdt.header.HeaderUtil; -import org.rdfhdt.hdt.iterator.utils.*; -import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.iterator.utils.FluxStopTripleStringIterator; +import org.rdfhdt.hdt.iterator.utils.MapIterator; 
+import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTOptionsKeys; @@ -33,19 +24,10 @@ import org.rdfhdt.hdt.rdf.RDFParserCallback; import org.rdfhdt.hdt.rdf.RDFParserFactory; import org.rdfhdt.hdt.rdf.TripleWriter; -import org.rdfhdt.hdt.triples.TempTriples; import org.rdfhdt.hdt.triples.TripleString; -import org.rdfhdt.hdt.triples.TriplesPrivate; import org.rdfhdt.hdt.util.BitUtil; import org.rdfhdt.hdt.util.Profiler; -import org.rdfhdt.hdt.util.StringUtil; -import org.rdfhdt.hdt.util.concurrent.KWayMerger; -import org.rdfhdt.hdt.util.io.CloseSuppressPath; import org.rdfhdt.hdt.util.io.IOUtil; -import org.rdfhdt.hdt.util.io.compress.MapCompressTripleMerger; -import org.rdfhdt.hdt.util.io.compress.TripleGenerator; -import org.rdfhdt.hdt.util.listener.IntermediateListener; -import org.rdfhdt.hdt.util.listener.ListenerUtil; import org.rdfhdt.hdt.util.listener.PrefixListener; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,7 +38,10 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; public class HDTManagerImpl extends HDTManager { private static final Logger logger = LoggerFactory.getLogger(HDTManagerImpl.class); @@ -71,6 +56,17 @@ public HDTOptions doReadOptions(String file) throws IOException { return new HDTSpecification(file); } + private HDT loadOrMapHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException { + String loadingMethod = spec.get(HDTOptionsKeys.LOAD_HDT_TYPE_KEY); + if (loadingMethod == null || loadingMethod.isEmpty() || HDTOptionsKeys.LOAD_HDT_TYPE_VALUE_MAP.equals(loadingMethod)) { + return doMapHDT(hdtFileName, listener, spec); + } + if (HDTOptionsKeys.LOAD_HDT_TYPE_VALUE_LOAD.equals(loadingMethod)) { + return doLoadHDT(hdtFileName, listener, spec); + } + throw new IllegalArgumentException("Bad loading method: " + loadingMethod); + } + @Override public HDT doLoadHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException { HDTPrivate hdt = new HDTImpl(spec); @@ -126,37 +122,28 @@ public HDT doIndexedHDT(HDT hdt, ProgressListener listener) throws IOException { private RDFFluxStop readFluxStopOrSizeLimit(HDTOptions spec) { // if no config, use default implementation - return Objects.requireNonNullElseGet( - RDFFluxStop.readConfig(spec.get(HDTOptionsKeys.RDF_FLUX_STOP_KEY)), - () -> { - // get the chunk size to base the work - String loaderType = spec.get(HDTOptionsKeys.LOADER_CATTREE_LOADERTYPE_KEY); - - if (!HDTOptionsKeys.LOADER_TYPE_VALUE_DISK.equals(loaderType)) { - // memory based implementation, we can only store the NT file - return RDFFluxStop.sizeLimit(getMaxChunkSize()); - } - - // disk based implementation, we only have to reduce the fault-factor of the map files - long chunkSize = findBestMemoryChunkDiskMapTreeCat(); - - String factorOpt = spec.get(HDTOptionsKeys.LOADER_CATTREE_MEMORY_FAULT_FACTOR); - double factor; - - if (factorOpt == null || factorOpt.isEmpty()) { - // default value - factor = 1.4; - } else { - factor = Double.parseDouble(factorOpt); - - if (factor <= 0) { - throw new IllegalArgumentException(HDTOptionsKeys.LOADER_CATTREE_MEMORY_FAULT_FACTOR + " can't have a negative or 0 value!"); - } - } - - // create a count limit from the chunk size / factor, set a minimum value for low factor - 
return RDFFluxStop.countLimit(Math.max(128, (long) (chunkSize * factor))); + return spec.getFluxStop(HDTOptionsKeys.RDF_FLUX_STOP_KEY, + () -> { + // get the chunk size to base the work + String loaderType = spec.get(HDTOptionsKeys.LOADER_CATTREE_LOADERTYPE_KEY); + + if (!HDTOptionsKeys.LOADER_TYPE_VALUE_DISK.equals(loaderType)) { + // memory based implementation, we can only store the NT file + return RDFFluxStop.sizeLimit(getMaxChunkSize()); } + + // disk based implementation, we only have to reduce the fault-factor of the map files + long triplesCount = findBestMemoryChunkDiskMapTreeCat(); + + double factor = spec.getDouble(HDTOptionsKeys.LOADER_CATTREE_MEMORY_FAULT_FACTOR, 1.4); + + if (factor <= 0) { + throw new IllegalArgumentException(HDTOptionsKeys.LOADER_CATTREE_MEMORY_FAULT_FACTOR + " can't have a negative or 0 value!"); + } + + // create a count limit from the chunk size / factor, set a minimum value for low factor + return RDFFluxStop.countLimit(Math.max(128, (long) (triplesCount * factor))); + } ); } @@ -276,17 +263,10 @@ public HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation RDFParserCallback parser = RDFParserFactory.getParserCallback(rdfNotation, useSimple(hdtFormat)); // read the stream as triples try (PipedCopyIterator iterator = RDFParserFactory.readAsIterator(parser, fileStream, baseURI, true, rdfNotation)) { - return doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener); + return doGenerateHDTDisk0(iterator, true, baseURI, hdtFormat, listener); } } - /** - * @return a theoretical maximum amount of memory the JVM will attempt to use - */ - static long getMaxChunkSize(int workers) { - Runtime runtime = Runtime.getRuntime(); - return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * 0.85 / (1.5 * 3 * workers)); - } /** * @return a theoretical maximum amount of memory the JVM will attempt to use */ @@ -312,196 +292,16 @@ private static long findBestMemoryChunkDiskMapTreeCat() { @Override public HDT doGenerateHDTDisk(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener progressListener) throws IOException, ParserException { - MultiThreadListener listener = ListenerUtil.multiThreadListener(progressListener); - // load config - // compression mode - String compressMode = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY); // see CompressionResult - // worker for compression tasks - int workers = (int) hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY); - // maximum size of a chunk - long chunkSize = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY); - long maxFileOpenedLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_MAX_FILE_OPEN_KEY); - long kwayLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_KWAY_KEY); - long bufferSizeLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_BUFFER_SIZE_KEY); - int maxFileOpened; - int ways; - int bufferSize; - // location of the working directory, will be deleted after generation - String baseNameOpt = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY); - CloseSuppressPath basePath; - // location of the future HDT file, do not set to create the HDT in memory while mergin - String futureHDTLocation = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY); - - Profiler profiler = new Profiler("doGenerateHDTDisk", hdtFormat); - // check and set default values if required - if (workers == 0) { - workers = Runtime.getRuntime().availableProcessors(); - } else if (workers < 0) { - throw new 
IllegalArgumentException("Negative number of workers!"); - } - if (baseNameOpt == null || baseNameOpt.isEmpty()) { - basePath = CloseSuppressPath.of(Files.createTempDirectory("hdt-java-generate-disk")); - } else { - basePath = CloseSuppressPath.of(baseNameOpt); - } - basePath.closeWithDeleteRecurse(); - if (chunkSize == 0) { - chunkSize = getMaxChunkSize(workers); - } else if (chunkSize < 0) { - throw new IllegalArgumentException("Negative chunk size!"); - } - if (bufferSizeLong > Integer.MAX_VALUE - 5L || bufferSizeLong < 0) { - throw new IllegalArgumentException("Buffer size can't be negative or bigger than the size of an array!"); - } else if (bufferSizeLong == 0) { - bufferSize = CloseSuppressPath.BUFFER_SIZE; - } else { - bufferSize = (int) bufferSizeLong; - } - if (maxFileOpenedLong < 0 || maxFileOpenedLong > Integer.MAX_VALUE) { - throw new IllegalArgumentException("maxFileOpened can't be negative!"); - } else if (maxFileOpenedLong == 0) { - maxFileOpened = 1024; - } else { - maxFileOpened = (int) maxFileOpenedLong; - } - if (kwayLong < 0 || kwayLong > Integer.MAX_VALUE) { - throw new IllegalArgumentException("kway can't be negative!"); - } else if (kwayLong == 0) { - ways = Math.max(1, BitUtil.log2(maxFileOpened / workers)); - } else { - ways = (int) kwayLong; - } - boolean mapHDT = futureHDTLocation != null && !futureHDTLocation.isEmpty(); - - // create working directory - basePath.mkdirs(); - try { - // compress the triples into sections and compressed triples - listener.notifyProgress(0, "Sorting sections with chunk of size: " + StringUtil.humanReadableByteCount(chunkSize, true) + "B with " + ways + "ways and " + workers + " worker(s)"); - - AsyncIteratorFetcher source = new AsyncIteratorFetcher<>(iterator); - - profiler.pushSection("section compression"); - CompressionResult compressionResult; - try { - compressionResult = new SectionCompressor( - basePath.resolve("sectionCompression"), - source, - listener, - bufferSize, - chunkSize, 1 << ways - ).compress(workers, compressMode); - } catch (KWayMerger.KWayMergerException | InterruptedException e) { - throw new ParserException(e); - } - profiler.popSection(); - - HDTBase hdt; - if (!mapHDT) { - // using default implementation - hdt = new HDTImpl(hdtFormat); - } else { - // using map implementation - hdt = new WriteHDTImpl(hdtFormat, basePath.resolve("maphdt"), bufferSize); - } - hdt.setBaseUri(baseURI); - - listener.unregisterAllThreads(); - listener.notifyProgress(20, "Create sections and triple mapping"); - - profiler.pushSection("dictionary write"); - // create sections and triple mapping - DictionaryPrivate dictionary = hdt.getDictionary(); - CompressTripleMapper mapper = new CompressTripleMapper(basePath, compressionResult.getTripleCount(), chunkSize); - CompressFourSectionDictionary modifiableDictionary = new CompressFourSectionDictionary(compressionResult, mapper, listener); - try { - dictionary.loadAsync(modifiableDictionary, listener); - } catch (InterruptedException e) { - throw new ParserException(e); - } - profiler.popSection(); - - // complete the mapper with the shared count and delete compression data - compressionResult.delete(); - mapper.setShared(dictionary.getNshared()); - - listener.notifyProgress(40, "Create mapped and sort triple file"); - // create mapped triples file - TripleCompressionResult tripleCompressionResult; - TriplesPrivate triples = hdt.getTriples(); - TripleComponentOrder order = triples.getOrder(); - profiler.pushSection("triple compression/map"); - try { - MapCompressTripleMerger 
tripleMapper = new MapCompressTripleMerger( - basePath.resolve("tripleMapper"), - new AsyncIteratorFetcher<>(new TripleGenerator(compressionResult.getTripleCount())), - mapper, - listener, - order, - bufferSize, - chunkSize, - 1 << ways); - tripleCompressionResult = tripleMapper.merge(workers, compressMode); - } catch (KWayMerger.KWayMergerException | InterruptedException e) { - throw new ParserException(e); - } - profiler.popSection(); - listener.unregisterAllThreads(); - - profiler.pushSection("bit triple creation"); - try { - // create bit triples and load the triples - TempTriples tempTriples = tripleCompressionResult.getTriples(); - IntermediateListener il = new IntermediateListener(listener); - il.setRange(80, 90); - il.setPrefix("Create bit triples: "); - il.notifyProgress(0, "create triples"); - triples.load(tempTriples, il); - tempTriples.close(); - - // completed the triples, delete the mapper - mapper.delete(); - } finally { - tripleCompressionResult.close(); - } - profiler.popSection(); - profiler.pushSection("header creation"); - - listener.notifyProgress(90, "Create HDT header"); - // header - hdt.populateHeaderStructure(hdt.getBaseURI()); - hdt.getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, compressionResult.getRawSize()); + return doGenerateHDTDisk0(iterator, hdtFormat.getBoolean(HDTOptionsKeys.LOADER_DISK_NO_COPY_ITERATOR_KEY), baseURI, hdtFormat, progressListener); + } - profiler.popSection(); - // return the HDT - if (mapHDT) { - profiler.pushSection("map to hdt"); - // write the HDT and map it - try { - hdt.saveToHDT(futureHDTLocation, listener); - } finally { - hdt.close(); - } - IntermediateListener il = new IntermediateListener(listener); - il.setPrefix("Map HDT: "); - il.setRange(95, 100); - il.notifyProgress(0, "start"); - try { - return doMapHDT(futureHDTLocation, il, hdtFormat); - } finally { - profiler.popSection(); - } + private HDT doGenerateHDTDisk0(Iterator iterator, boolean copyIterator, String baseURI, HDTOptions hdtFormat, ProgressListener progressListener) throws IOException, ParserException { + try (HDTDiskImporter hdtDiskImporter = new HDTDiskImporter(hdtFormat, progressListener, baseURI)) { + if (copyIterator) { + return hdtDiskImporter.runAllSteps(iterator); } else { - listener.notifyProgress(100, "HDT completed"); - return hdt; - } - } finally { - try { - profiler.stop(); - profiler.writeProfiling(); - listener.notifyProgress(100, "Clearing disk"); - } finally { - basePath.close(); + // create a copy of the triple at loading time to avoid weird behaviors + return hdtDiskImporter.runAllSteps(new MapIterator<>(iterator, TripleString::tripleToString)); } } } @@ -518,8 +318,8 @@ protected TripleWriter doGetHDTWriter(String outFile, String baseURI, HDTOptions @Override public HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException { - try (HDT hdt1 = doMapHDT(hdtFileName1, listener, hdtFormat); - HDT hdt2 = doMapHDT(hdtFileName2, listener, hdtFormat)) { + try (HDT hdt1 = loadOrMapHDT(hdtFileName1, listener, hdtFormat); + HDT hdt2 = loadOrMapHDT(hdtFileName2, listener, hdtFormat)) { HDTImpl hdt = new HDTImpl(hdtFormat); if (hdt1.getDictionary() instanceof MultipleSectionDictionary && hdt2.getDictionary() instanceof MultipleSectionDictionary) { @@ -534,8 +334,8 @@ public HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, H @Override public HDT doHDTDiff(String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener 
listener) throws IOException { - try (HDT hdt1 = doMapHDT(hdtFileName1, listener, hdtFormat); - HDT hdt2 = doMapHDT(hdtFileName2, listener, hdtFormat)) { + try (HDT hdt1 = loadOrMapHDT(hdtFileName1, listener, hdtFormat); + HDT hdt2 = loadOrMapHDT(hdtFileName2, listener, hdtFormat)) { HDTImpl hdt = new HDTImpl(hdtFormat); hdt.diff(hdt1, hdt2, listener); return hdt; @@ -544,9 +344,17 @@ public HDT doHDTDiff(String hdtFileName1, String hdtFileName2, HDTOptions hdtFor @Override protected HDT doHDTDiffBit(String location, String hdtFileName, Bitmap deleteBitmap, HDTOptions hdtFormat, ProgressListener listener) throws IOException { - try (HDT hdtOriginal = doMapHDT(hdtFileName, listener, hdtFormat)) { + try (HDT hdtOriginal = loadOrMapHDT(hdtFileName, listener, hdtFormat)) { HDTImpl hdt = new HDTImpl(hdtFormat); - hdt.diffBit(location, hdtOriginal, deleteBitmap, listener); + try { + hdt.diffBit(location, hdtOriginal, deleteBitmap, listener); + } catch (Throwable t) { + try { + throw t; + } finally { + hdt.close(); + } + } return hdt; } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java new file mode 100644 index 00000000..64d3e5fa --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java @@ -0,0 +1,364 @@ +package org.rdfhdt.hdt.hdt.impl; + +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.dictionary.impl.CompressFourSectionDictionary; +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.hdt.HDT; +import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressTripleMapper; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; +import org.rdfhdt.hdt.hdt.impl.diskimport.SectionCompressor; +import org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult; +import org.rdfhdt.hdt.header.HeaderPrivate; +import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.triples.TriplesPrivate; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.Profiler; +import org.rdfhdt.hdt.util.StringUtil; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.concurrent.KWayMerger; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.compress.MapCompressTripleMerger; +import org.rdfhdt.hdt.util.io.compress.TripleGenerator; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.listener.ListenerUtil; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Files; +import java.util.Iterator; + +/** + * HDT Disk generation methods + * + * @author Antoine Willerval + */ +public class HDTDiskImporter implements Closeable { + /** + * @return a theoretical maximum amount of memory the JVM will attempt to use + */ + static long getMaxChunkSize(int workers) { + Runtime runtime = Runtime.getRuntime(); + return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * 0.85 / (1.5 * 3 * workers)); + } + + // configs + private final HDTOptions hdtFormat; + private final 
MultiThreadListener listener;
+	private final String compressMode;
+	private final String futureHDTLocation;
+	private final CloseSuppressPath basePath;
+	private final long chunkSize;
+	private final int ways;
+	private final int workers;
+	private final int bufferSize;
+	private final boolean mapHDT;
+	private final boolean debugHDTBuilding;
+	private final Profiler profiler;
+	private final HDTBase hdt;
+	private long rawSize;
+
+	// component status
+	private boolean dict = false;
+	private boolean header = false;
+	private boolean triples = false;
+
+	public HDTDiskImporter(HDTOptions hdtFormat, ProgressListener progressListener, String baseURI) throws IOException {
+		this.hdtFormat = hdtFormat;
+		listener = ListenerUtil.multiThreadListener(progressListener);
+		// load config
+
+		// compression mode
+		compressMode = hdtFormat.get(
+				HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY,
+				HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE
+		); // see CompressionResult
+		// worker for compression tasks
+		workers = (int) hdtFormat.getInt(
+				HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY,
+				Runtime.getRuntime()::availableProcessors
+		);
+		// check and set default values if required
+		if (workers <= 0) {
+			throw new IllegalArgumentException("Number of workers should be positive!");
+		}
+		// maximum size of a chunk
+		chunkSize = hdtFormat.getInt(
+				HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY,
+				() -> getMaxChunkSize(this.workers)
+		);
+		if (chunkSize < 0) {
+			throw new IllegalArgumentException("Negative chunk size!");
+		}
+		long maxFileOpenedLong = hdtFormat.getInt(
+				HDTOptionsKeys.LOADER_DISK_MAX_FILE_OPEN_KEY,
+				1024
+		);
+		int maxFileOpened;
+		if (maxFileOpenedLong < 0 || maxFileOpenedLong > Integer.MAX_VALUE) {
+			throw new IllegalArgumentException("maxFileOpened should be positive!");
+		} else {
+			maxFileOpened = (int) maxFileOpenedLong;
+		}
+		long kwayLong = hdtFormat.getInt(
+				HDTOptionsKeys.LOADER_DISK_KWAY_KEY,
+				() -> Math.max(1, BitUtil.log2(maxFileOpened / this.workers))
+		);
+		if (kwayLong <= 0 || kwayLong > Integer.MAX_VALUE) {
+			throw new IllegalArgumentException("kway should be positive!");
+		} else {
+			ways = (int) kwayLong;
+		}
+
+		long bufferSizeLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_BUFFER_SIZE_KEY, CloseSuppressPath.BUFFER_SIZE);
+		if (bufferSizeLong > Integer.MAX_VALUE - 5L || bufferSizeLong <= 0) {
+			throw new IllegalArgumentException("Buffer size can't be negative or bigger than the size of an array!");
+		} else {
+			bufferSize = (int) bufferSizeLong;
+		}
+		// location of the working directory, will be deleted after generation
+		String baseNameOpt = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY);
+		// location of the future HDT file, do not set it to create the HDT in memory while merging
+		futureHDTLocation = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY);
+
+		profiler = new Profiler("doGenerateHDTDisk", hdtFormat);
+		if (baseNameOpt == null || baseNameOpt.isEmpty()) {
+			basePath = CloseSuppressPath.of(Files.createTempDirectory("hdt-java-generate-disk"));
+		} else {
+			basePath = CloseSuppressPath.of(baseNameOpt);
+		}
+		basePath.closeWithDeleteRecurse();
+		mapHDT = futureHDTLocation != null && !futureHDTLocation.isEmpty();
+		// debug the build strategy
+		debugHDTBuilding = hdtFormat.getBoolean("debug.disk.build");
+
+		// create working directory
+		basePath.mkdirs();
+
+		if (!mapHDT) {
+			// using default implementation
+			hdt = new HDTImpl(hdtFormat);
+		} else {
+			// using map implementation
+			hdt = new WriteHDTImpl(hdtFormat, basePath.resolve("maphdt"), bufferSize);
+		}
+		hdt.setBaseUri(baseURI);
+	}
+
+	/**
+	 * create the Dictionary of the HDT
+	 *
+	 * @param iterator the triples stream to create the dictionary
+	 * @throws java.lang.IllegalArgumentException if the component was already built
+	 */
+	public CompressTripleMapper compressDictionary(Iterator iterator) throws ParserException, IOException {
+		if (this.dict) {
+			throw new IllegalArgumentException("Dictionary already built! Use another importer instance!");
+		}
+		listener.notifyProgress(0, "Sorting sections with chunk of size: " + StringUtil.humanReadableByteCount(chunkSize, true) + "B with " + ways + " ways and " + workers + " worker(s)");
+
+		AsyncIteratorFetcher source = new AsyncIteratorFetcher<>(iterator);
+
+		profiler.pushSection("section compression");
+		CompressionResult compressionResult;
+		try {
+			compressionResult = new SectionCompressor(
+					basePath.resolve("sectionCompression"),
+					source,
+					listener,
+					bufferSize,
+					chunkSize, 1 << ways,
+					hdtFormat.getBoolean("debug.disk.slow.stream2")).compress(workers, compressMode);
+		} catch (KWayMerger.KWayMergerException | InterruptedException e) {
+			throw new ParserException(e);
+		}
+		profiler.popSection();
+
+		listener.unregisterAllThreads();
+		listener.notifyProgress(20, "Create sections and triple mapping");
+
+		profiler.pushSection("dictionary write");
+		// create sections and triple mapping
+		DictionaryPrivate dictionary = hdt.getDictionary();
+		CompressTripleMapper mapper = new CompressTripleMapper(basePath, compressionResult.getTripleCount(), chunkSize);
+		try (CompressFourSectionDictionary modifiableDictionary = new CompressFourSectionDictionary(compressionResult, mapper, listener, debugHDTBuilding)) {
+			dictionary.loadAsync(modifiableDictionary, listener);
+		} catch (InterruptedException e) {
+			throw new ParserException(e);
+		}
+		profiler.popSection();
+
+		// complete the mapper with the shared count and delete compression data
+		compressionResult.delete();
+		rawSize = compressionResult.getRawSize();
+		mapper.setShared(dictionary.getNshared());
+
+		this.dict = true;
+		return mapper;
+	}
+
+	/**
+	 * create the Triples of the HDT
+	 *
+	 * @param mapper the mapper from the dictionary building
+	 * @throws java.lang.IllegalArgumentException if the component was already built
+	 */
+	public void compressTriples(CompressTripleMapper mapper) throws ParserException, IOException {
+		if (this.triples) {
+			throw new IllegalArgumentException("Triples already built! Use another importer instance!");
+		}
+		listener.notifyProgress(40, "Create mapped and sort triple file");
+		// create mapped triples file
+		TripleCompressionResult tripleCompressionResult;
+		TriplesPrivate triples = hdt.getTriples();
+		TripleComponentOrder order = triples.getOrder();
+		profiler.pushSection("triple compression/map");
+		try {
+			MapCompressTripleMerger tripleMapper = new MapCompressTripleMerger(
+					basePath.resolve("tripleMapper"),
+					new AsyncIteratorFetcher<>(new TripleGenerator(mapper.getTripleCount())),
+					mapper,
+					listener,
+					order,
+					bufferSize,
+					chunkSize,
+					1 << ways);
+			tripleCompressionResult = tripleMapper.merge(workers, compressMode);
+		} catch (KWayMerger.KWayMergerException | InterruptedException e) {
+			throw new ParserException(e);
+		}
+		profiler.popSection();
+		listener.unregisterAllThreads();
+
+		profiler.pushSection("bit triple creation");
+		try {
+			// create bit triples and load the triples
+			TempTriples tempTriples = tripleCompressionResult.getTriples();
+			IntermediateListener il = new IntermediateListener(listener);
+			il.setRange(80, 90);
+			il.setPrefix("Create bit triples: ");
+			il.notifyProgress(0, "create triples");
+			triples.load(tempTriples, il);
+			tempTriples.close();
+
+			// completed the triples, delete the mapper
+			mapper.delete();
+		} finally {
+			tripleCompressionResult.close();
+		}
+		profiler.popSection();
+
+		this.triples = true;
+	}
+
+	/**
+	 * create the Header of the HDT
+	 *
+	 * @throws java.lang.IllegalArgumentException if the component was already built
+	 */
+	public void createHeader() {
+		if (this.header) {
+			throw new IllegalArgumentException("Header already built! Use another importer instance!");
+		}
+		profiler.pushSection("header creation");
+
+		listener.notifyProgress(90, "Create HDT header");
+		// header
+		hdt.populateHeaderStructure(hdt.getBaseURI());
+		hdt.getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, rawSize);
+
+		profiler.popSection();
+
+		this.header = true;
+	}
+
+	/**
+	 * create the HDT from the parts of the previous methods
+	 *
+	 * @return hdt
+	 * @throws IOException io exception during disk generation
+	 * @throws java.lang.IllegalArgumentException if a component is missing
+	 */
+	public HDT convertToHDT() throws IOException {
+		if (!this.dict) {
+			throw new IllegalArgumentException("Dictionary missing, can't create HDT");
+		}
+		if (!this.triples) {
+			throw new IllegalArgumentException("Triples missing, can't create HDT");
+		}
+		if (!this.header) {
+			throw new IllegalArgumentException("Header missing, can't create HDT");
+		}
+		// return the HDT
+		if (mapHDT) {
+			profiler.pushSection("map to hdt");
+			// write the HDT and map it
+			try {
+				hdt.saveToHDT(futureHDTLocation, listener);
+			} finally {
+				hdt.close();
+			}
+			IntermediateListener il = new IntermediateListener(listener);
+			il.setPrefix("Map HDT: ");
+			il.setRange(95, 100);
+			il.notifyProgress(0, "start");
+			try {
+				return HDTManager.mapHDT(futureHDTLocation, il, hdtFormat);
+			} finally {
+				profiler.popSection();
+			}
+		} else {
+			listener.notifyProgress(100, "HDT completed");
+			return hdt;
+		}
+	}
+
+	/**
+	 * call all the steps to create an HDT
+	 *
+	 * @param iterator the iterator to load the data
+	 * @return hdt
+	 * @throws IOException io exception during disk generation
+	 * @throws ParserException parsing exception during disk generation
+	 * @throws java.lang.IllegalArgumentException if a component is missing
+	 */
+	public HDT runAllSteps(Iterator iterator) throws IOException, ParserException {
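+		// the same pipeline can also be driven step by step on a hypothetical
+		// importer instance (sketch of the calls below):
+		//   CompressTripleMapper mapper = importer.compressDictionary(it);
+		//   importer.compressTriples(mapper);
+		//   importer.createHeader();
+		//   HDT hdt = importer.convertToHDT();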
+		// compress the triples into sections and compressed triples
+		ExceptionThread.pushDebugPoint();
+
+		CompressTripleMapper mapper = compressDictionary(iterator);
+
+		ExceptionThread.popDebugPoint("end compress dict");
+
+		ExceptionThread.pushDebugPoint();
+
+		compressTriples(mapper);
+
+		ExceptionThread.popDebugPoint("end compress triple");
+
+		createHeader();
+
+		return convertToHDT();
+	}
+
+	@Override
+	public void close() throws IOException {
+		try {
+			profiler.stop();
+			profiler.writeProfiling();
+			listener.notifyProgress(100, "Clearing disk");
+		} finally {
+			basePath.close();
+		}
+	}
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java
index 458ffdbd..83663093 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java
@@ -52,6 +52,7 @@
 import org.rdfhdt.hdt.hdt.HDTVersion;
 import org.rdfhdt.hdt.hdt.HDTVocabulary;
 import org.rdfhdt.hdt.hdt.TempHDT;
+import org.rdfhdt.hdt.header.Header;
 import org.rdfhdt.hdt.header.HeaderFactory;
 import org.rdfhdt.hdt.header.HeaderPrivate;
 import org.rdfhdt.hdt.iterator.DictionaryTranslateIterator;
@@ -60,6 +61,7 @@
 import org.rdfhdt.hdt.options.ControlInfo;
 import org.rdfhdt.hdt.options.ControlInformation;
 import org.rdfhdt.hdt.options.HDTOptions;
+import org.rdfhdt.hdt.options.HDTOptionsKeys;
 import org.rdfhdt.hdt.options.HDTSpecification;
 import org.rdfhdt.hdt.triples.DictionaryEntriesDiff;
 import org.rdfhdt.hdt.triples.IteratorTripleID;
@@ -74,6 +76,7 @@
 import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorCat;
 import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorDiff;
 import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorMapDiff;
+import org.rdfhdt.hdt.util.LiteralsUtils;
 import org.rdfhdt.hdt.util.StopWatch;
 import org.rdfhdt.hdt.util.io.CountInputStream;
 import org.rdfhdt.hdt.util.io.IOUtil;
@@ -92,7 +95,6 @@
 import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Paths;
-import java.util.Iterator;
 import java.util.Map;
 import java.util.zip.GZIPInputStream;
@@ -249,10 +251,10 @@ public void mapFromHDT(File f, long offset, ProgressListener listener) throws IO
	 */
	@Override
	public void saveToHDT(String fileName, ProgressListener listener) throws IOException {
-		OutputStream out = new BufferedOutputStream(new FileOutputStream(fileName));
-		//OutputStream out = new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(fileName)));
-		saveToHDT(out, listener);
-		out.close();
+		try (OutputStream out = new BufferedOutputStream(new FileOutputStream(fileName))) {
+			//OutputStream out = new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(fileName)));
+			saveToHDT(out, listener);
+		}
		this.hdtFileName = fileName;
	}
@@ -470,10 +472,10 @@ public boolean isMapped() {
	/**
	 * Merges two hdt files hdt1 and hdt2 on disk at location
-	 * @param location
-	 * @param hdt1
-	 * @param hdt2
-	 * @param listener
+	 * @param location cat working location
+	 * @param hdt1 first HDT
+	 * @param hdt2 second HDT
+	 * @param listener progress listener
	 */
	public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException {
		if (listener != null) {
@@ -538,6 +540,28 @@ public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener)
		}
		this.header = HeaderFactory.createHeader(spec);
		this.populateHeaderStructure(hdt1.getBaseURI());
+		long rawSize1 = getRawSize(hdt1.getHeader());
+		long rawSize2 = getRawSize(hdt2.getHeader());
+
+		if (rawSize1 != -1 && rawSize2 != -1) {
+			getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, String.valueOf(rawSize1 + rawSize2));
+		}
+	}
+
+	public static long getRawSize(Header header) {
+
+		try {
+			IteratorTripleString rawSize1 = header.search("_:statistics", HDTVocabulary.ORIGINAL_SIZE, "");
+			if (!rawSize1.hasNext()) {
+				return -1;
+			}
+
+			CharSequence obj = rawSize1.next().getObject();
+			// strip the quotes surrounding the literal value
+			return Long.parseLong(obj, 1, obj.length() - 1, 10);
+		} catch (NotFoundException e) {
+			return -1;
+		}
	}

	public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException {
@@ -550,7 +574,8 @@ public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener list
		ControlInfo ci2 = new ControlInformation();
		try (CountInputStream fis = new CountInputStream(new BufferedInputStream(new FileInputStream(location + "dictionary")))) {
			HDTSpecification spec = new HDTSpecification();
-			spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;");
+			spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH);
+			spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS);
			MultipleSectionDictionaryBig dictionary = new MultipleSectionDictionaryBig(spec);
			fis.mark(1024);
			ci2.load(fis);
@@ -567,26 +592,22 @@ public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener list
			bitmapTriplesCat.cat(it,listener);
		}
		//Delete the mappings since they are not necessary anymore
-		Iterator> iter = hdt1.getDictionary().getAllObjects().entrySet().iterator();
		int countSubSections = 0;
-		while (iter.hasNext()){
-			Map.Entry entry = iter.next();
-			String dataType = entry.getKey();
-			String prefix = "sub"+countSubSections;
-			if(dataType.equals("NO_DATATYPE"))
-				prefix = dataType;
+		for (CharSequence datatype : hdt1.getDictionary().getAllObjects().keySet()) {
+			String prefix = "sub" + countSubSections;
+			if(datatype.equals(LiteralsUtils.NO_DATATYPE)) {
+				prefix = datatype.toString();
+			}
			Files.delete(Paths.get(location+prefix+"1"));
			Files.delete(Paths.get(location+prefix+"1"+"Types"));
			countSubSections++;
		}
-		iter = hdt2.getDictionary().getAllObjects().entrySet().iterator();
		countSubSections = 0;
-		while (iter.hasNext()){
-			Map.Entry entry = iter.next();
-			String dataType = entry.getKey();
+		for (CharSequence datatype : hdt2.getDictionary().getAllObjects().keySet()){
			String prefix = "sub"+countSubSections;
-			if(dataType.equals("NO_DATATYPE"))
-				prefix = dataType;
+			if(datatype.equals(LiteralsUtils.NO_DATATYPE)) {
+				prefix = datatype.toString();
+			}
			Files.delete(Paths.get(location+prefix+"2"));
			Files.delete(Paths.get(location+prefix+"2"+"Types"));
			countSubSections++;
@@ -626,6 +647,12 @@ public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener list
		}
		this.header = HeaderFactory.createHeader(spec);
		this.populateHeaderStructure(hdt1.getBaseURI());
+		long rawSize1 = getRawSize(hdt1.getHeader());
+		long rawSize2 = getRawSize(hdt2.getHeader());
+
+		if (rawSize1 != -1 && rawSize2 != -1) {
+			getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, String.valueOf(rawSize1 + rawSize2));
+		}
	}

	public void diff(HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException {
@@ -644,7 +671,7 @@ public void diffBit(String location, HDT hdt, Bitmap deleteBitmap, ProgressListe
		iter.loadBitmaps();

-		Map bitmaps = iter.getBitmaps();
+		Map bitmaps = iter.getBitmaps();

		try (DictionaryDiff diff = DictionaryFactory.createDictionaryDiff(hdt.getDictionary(), location)) {
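A minimal sketch of how the new raw-size statistic behaves (illustrative only; assumes both headers store the originalSize value as a quoted literal, which getRawSize strips before parsing):

	long rawSize1 = HDTImpl.getRawSize(hdt1.getHeader()); // -1 when the statistic is absent
	long rawSize2 = HDTImpl.getRawSize(hdt2.getHeader());
	if (rawSize1 != -1 && rawSize2 != -1) {
		// cat()/catCustom() write the sum into the merged header
		header.insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, String.valueOf(rawSize1 + rawSize2));
	}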
@@ -674,10 +701,11 @@ public void diffBit(String location, HDT hdt, Bitmap deleteBitmap, ProgressListe il.notifyProgress(80, "Clear data..."); if(!(hdt.getDictionary() instanceof FourSectionDictionary)) { int count = 0; - for (Map.Entry next : dictionary.getAllObjects().entrySet()) { - String subPrefix = "sub" + count; - if(next.getKey().equals("NO_DATATYPE")) - subPrefix = next.getKey(); + for (CharSequence key : dictionary.getAllObjects().keySet()) { + CharSequence subPrefix = "sub" + count; + if(key.equals(LiteralsUtils.NO_DATATYPE)) { + subPrefix = key; + } Files.delete(Paths.get(location + subPrefix)); Files.delete(Paths.get(location + subPrefix + "Types")); count++; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java index a1f13170..76d6599c 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java @@ -1,11 +1,16 @@ package org.rdfhdt.hdt.hdt.impl; +import org.rdfhdt.hdt.dictionary.DictionaryFactory; +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; import org.rdfhdt.hdt.dictionary.impl.WriteFourSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.WriteMultipleSectionDictionary; import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.header.HeaderFactory; import org.rdfhdt.hdt.header.HeaderPrivate; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.triples.IteratorTripleString; import org.rdfhdt.hdt.triples.impl.WriteBitmapTriples; import org.rdfhdt.hdt.util.io.CloseSuppressPath; @@ -24,7 +29,7 @@ * * @author Antoine Willerval */ -public class WriteHDTImpl extends HDTBase { +public class WriteHDTImpl extends HDTBase { private String baseURI; private final CloseSuppressPath workingLocation; private boolean isClosed; @@ -34,7 +39,7 @@ public WriteHDTImpl(HDTOptions spec, CloseSuppressPath workingLocation, int buff this.workingLocation = workingLocation; workingLocation.mkdirs(); - dictionary = new WriteFourSectionDictionary(this.spec, workingLocation.resolve("section"), bufferSize); + dictionary = DictionaryFactory.createWriteDictionary(this.spec, workingLocation.resolve("section"), bufferSize); // we need to have the bitmaps in memory, so we can't bypass the implementation triples = new WriteBitmapTriples(this.spec, workingLocation.resolve("tripleBitmap"), bufferSize); // small, can use default implementation diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java index d8c4506d..c0c5b5de 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java @@ -27,8 +27,10 @@ public class CompressTripleMapper implements CompressFourSectionDictionary.NodeC private final CloseSuppressPath locationPredicates; private final CloseSuppressPath locationObjects; private long shared = -1; + private final long tripleCount; public CompressTripleMapper(CloseSuppressPath location, long tripleCount, long chunkSize) { + this.tripleCount = tripleCount; locationSubjects = location.resolve("map_subjects"); locationPredicates = location.resolve("map_predicates"); 
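WriteHDTImpl above now obtains its write dictionary from DictionaryFactory.createWriteDictionary instead of hard-coding WriteFourSectionDictionary. A sketch of the dispatch this implies, assuming the factory keys on the dictionary.type option the way the spec handling elsewhere in this patch does (the body below is illustrative, not the factory's actual code):

static DictionaryPrivate createWriteDictionary(HDTOptions spec, CloseSuppressPath location, int bufferSize) {
    String type = spec.get(HDTOptionsKeys.DICTIONARY_TYPE_KEY);
    if (HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS.equals(type)) {
        // one object section per datatype
        return new WriteMultipleSectionDictionary(spec, location, bufferSize);
    }
    // default: the classic four-section dictionary
    return new WriteFourSectionDictionary(spec, location, bufferSize);
}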
locationObjects = location.resolve("map_objects"); @@ -131,4 +133,8 @@ private long extract(LongArray array, long id) { // compute shared if required return CompressUtil.computeSharedNode(array.get(id), shared); } + + public long getTripleCount() { + return tripleCount; + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java index b9dcaba4..c9f851ae 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java @@ -2,6 +2,7 @@ import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; import org.rdfhdt.hdt.iterator.utils.IndexNodeDeltaMergeExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; import org.rdfhdt.hdt.iterator.utils.SizeFetcher; import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.triples.IndexedNode; @@ -41,21 +42,22 @@ public class SectionCompressor implements KWayMerger.KWayMergerImpl source; - private boolean done; private final MultiThreadListener listener; private final AtomicLong triples = new AtomicLong(); private final AtomicLong ntRawSize = new AtomicLong(); private final int bufferSize; private final long chunkSize; private final int k; + private final boolean debugSleepKwayDict; - public SectionCompressor(CloseSuppressPath baseFileName, AsyncIteratorFetcher source, MultiThreadListener listener, int bufferSize, long chunkSize, int k) { + public SectionCompressor(CloseSuppressPath baseFileName, AsyncIteratorFetcher source, MultiThreadListener listener, int bufferSize, long chunkSize, int k, boolean debugSleepKwayDict) { this.source = source; this.listener = listener; this.baseFileName = baseFileName; this.bufferSize = bufferSize; this.chunkSize = chunkSize; this.k = k; + this.debugSleepKwayDict = debugSleepKwayDict; } /* @@ -188,6 +190,14 @@ public void createChunk(SizeFetcher fetcher, CloseSuppressPath out TripleString next; while ((next = fetcher.get()) != null) { + if (debugSleepKwayDict) { + try { + Thread.sleep(25); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + // load the map triple and write it in the writer long tripleID = triples.incrementAndGet(); @@ -441,8 +451,8 @@ private void computeSection(List triples, String section, int start, } // section - try (OutputStream output = openW.get()) { - CompressUtil.writeCompressedSection(IndexNodeDeltaMergeExceptionIterator.buildOfTree(readers), size, output, il); + try (OutputStream output = openW.get()) { // IndexNodeDeltaMergeExceptionIterator + CompressUtil.writeCompressedSection(CompressNodeMergeIterator.buildOfTree(readers), size, output, il); } } finally { if (async) { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/writer/TripleWriterHDT.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/writer/TripleWriterHDT.java index 28595d6f..4eb68b99 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/writer/TripleWriterHDT.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/writer/TripleWriterHDT.java @@ -24,7 +24,7 @@ public class TripleWriterHDT implements TripleWriter { - private OutputStream out; + private final OutputStream out; private boolean close=false; HDTOptions spec; String baseUri; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/writer/TripleWriterNtriples.java 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/writer/TripleWriterNtriples.java index 5804d799..173052a3 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/writer/TripleWriterNtriples.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/writer/TripleWriterNtriples.java @@ -15,7 +15,7 @@ public class TripleWriterNtriples implements TripleWriter { - private Writer out; + private final Writer out; private boolean close=false; public TripleWriterNtriples(String outFile, boolean compress) throws IOException { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIterator.java index da929298..b2c99ae8 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIterator.java @@ -327,27 +327,27 @@ public void fetchNext() throws E { if (deltaCompare < 0) { // node1 > node2 -> send node2 next = last2; + pivot = -deltaCompare - 1; if (!send1) { // the last send was the send1, we can send the real delta delta = delta2; } else { // not the same, we need to compare to get the new delta - delta = Math.min(delta2, delta); + delta = 0; } - pivot = -deltaCompare - 1; last2 = null; send1 = false; } else { // node1 < node2 -> send node1 next = last1; + pivot = deltaCompare - 1; if (send1) { // the last send was the send2, we can send the real delta delta = delta1; } else { // not the same, we need to compare to get the new delta - delta = Math.min(delta1, delta); + delta = 0; } - pivot = deltaCompare - 1; last1 = null; send1 = true; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ListTripleIDIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ListTripleIDIterator.java index 5e32f91c..87593549 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ListTripleIDIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ListTripleIDIterator.java @@ -15,7 +15,7 @@ */ public class ListTripleIDIterator implements IteratorTripleID { - private List triplesList; + private final List triplesList; private int pos; private long lastPosition; @@ -39,7 +39,7 @@ public boolean hasNext() { @Override public TripleID next() { lastPosition = pos; - return triplesList.get((int)pos++); + return triplesList.get(pos++); } /* (non-Javadoc) @@ -55,7 +55,7 @@ public boolean hasPrevious() { */ @Override public TripleID previous() { - TripleID tripleID = triplesList.get((int)--pos); + TripleID tripleID = triplesList.get(--pos); lastPosition = pos; return tripleID; } @@ -117,4 +117,4 @@ public long getLastTriplePosition() { public void remove() { throw new UnsupportedOperationException(); } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java index 85b6ca8a..46610a89 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java @@ -161,6 +161,8 @@ public void closePipe() { public void closePipe(Throwable e) { if (e != null) { + // clear the queue to force the exception + queue.clear(); if (e instanceof PipedIteratorException) { this.exception = (PipedIteratorException) e; } else { @@ 
-217,6 +219,13 @@ public void attachThread(Thread thread) { this.thread = thread; } + /** + * Allow receiving again elements after an end node + */ + public void reset() { + this.end = false; + } + @Override public void close() throws IOException { if (thread != null) { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/RepeatApplyIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/RepeatApplyIterator.java index 425a077f..6413871f 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/RepeatApplyIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/RepeatApplyIterator.java @@ -6,7 +6,7 @@ public abstract class RepeatApplyIterator implements Iterator, Closeable { - private Iterator input ; + private final Iterator input ; private boolean finished = false ; private Iterator currentStage = null ; @@ -59,4 +59,4 @@ public final void remove() public void close() { } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SeveralIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SeveralIterator.java index 7a3853c4..a4117613 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SeveralIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SeveralIterator.java @@ -17,7 +17,7 @@ */ public class SeveralIterator extends PrefetchIterator { public interface IteratorGetter { - public Iterator get(K k); + Iterator get(K k); } IteratorGetter getter; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SideEffect.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SideEffect.java index 3f278d42..e3978736 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SideEffect.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/SideEffect.java @@ -1,5 +1,5 @@ package org.rdfhdt.hdt.iterator.utils; public interface SideEffect { - public void call(T param); + void call(T param); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HDTOptionsBase.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HDTOptionsBase.java index b16fa5c9..4e28a1ac 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HDTOptionsBase.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HDTOptionsBase.java @@ -44,11 +44,6 @@ public HDTOptionsBase() { properties = new Properties(); } - /** - * Gets a property value from a property key - * - * @param key - */ @Override public String get(String key) { return properties.getProperty(key); @@ -71,12 +66,9 @@ public void setOptions(String options) { } } - /** - * @param string - */ @Override - public long getInt(String string) { - String val = properties.getProperty(string.trim()); + public long getInt(String key) { + String val = properties.getProperty(key.trim()); if(val!=null) { return Long.parseLong(val); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRAR.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRAR.java index b32f1acb..d5c65550 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRAR.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRAR.java @@ -21,9 +21,6 @@ * * It uses RDFNotation.guess() to guess the format of each specific file. If not recognised, each file of the tar is ignored. 
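The reset() method added to PipedCopyIterator above is what allows a single pipe to carry several consecutive chunks: the producer ends a chunk, the consumer drains it, and reset() re-arms the pipe for the next round. A sketch of that consumption loop, instantiated for TripleString (createPipe, hasMoreChunks and buildChunk are hypothetical placeholders; only hasNext, next and reset come from this class):

PipedCopyIterator<TripleString> pipe = createPipe(); // hypothetical setup
while (hasMoreChunks()) {                            // hypothetical condition
    while (pipe.hasNext()) {
        buildChunk(pipe.next()); // consume one chunk of triples
    }
    pipe.reset(); // let elements flow again after the end marker
}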
* - * - * @author - * */ public class RDFParserRAR implements RDFParserCallback { @@ -32,7 +29,7 @@ public class RDFParserRAR implements RDFParserCallback { private final static String [] cmdList = { "unrar", "vb" , ""}; private final static String [] cmdExtractFile = { "unrar", "p", "-inul", "", "" }; private static Boolean available; - private boolean simple; + private final boolean simple; public RDFParserRAR(boolean simple) { this.simple = simple; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java index e9a3862f..8bd99295 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java @@ -61,7 +61,7 @@ private void parse(InputStream stream, String baseUri, Lang lang, boolean keepBN } private RDFCallback callback; - private TripleString triple = new TripleString(); + private final TripleString triple = new TripleString(); /* (non-Javadoc) * @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, java.lang.String, hdt.enums.RDFNotation, hdt.rdf.RDFParserCallback.Callback) diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/DictionaryEntriesDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/DictionaryEntriesDiff.java index e974d9c4..3007eb8c 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/DictionaryEntriesDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/DictionaryEntriesDiff.java @@ -39,7 +39,7 @@ static DictionaryEntriesDiff createForType(Dictionary dictionary, HDT hdt, Bitma /** * @return the bitmaps */ - Map getBitmaps(); + Map getBitmaps(); /** * create the bitmaps diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TripleIDComparatorInt.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TripleIDComparatorInt.java index 2b6e0a57..98794b47 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TripleIDComparatorInt.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TripleIDComparatorInt.java @@ -39,7 +39,7 @@ public class TripleIDComparatorInt implements Comparator { /** Determines the order of comparison */ - private TripleComponentOrder order; + private final TripleComponentOrder order; public static Comparator getComparator(TripleComponentOrder order) { return new TripleIDComparatorInt(order); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java index 34043379..f07224d0 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java @@ -486,7 +486,7 @@ class Pair { Long valueY; Long positionY; @Override public String toString() { return String.format("%d %d", valueY,positionY); } - }; + } // FIXME: Sort directly without copying? 
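Since the datatype keys of getAllObjects are now CharSequence values compared against LiteralsUtils.NO_DATATYPE rather than the "NO_DATATYPE" string, the cleanup loops in catCustom name the per-datatype section files by key position. A worked illustration with hypothetical datatype keys:

// hypothetical iteration order of the object sections
String[] keys = {"NO_DATATYPE", "<http://dt1>", "<http://dt2>"};
int countSubSections = 0;
for (String datatype : keys) {
    // only the untyped section keeps its literal name; typed sections are numbered
    String prefix = datatype.equals("NO_DATATYPE") ? datatype : "sub" + countSubSections;
    System.out.println(prefix + "1"); // NO_DATATYPE1, sub11, sub21
    countSubSections++;
}

So the hdt1-side loop deletes NO_DATATYPE1, sub11 and sub21 plus the matching ...Types files, and the hdt2-side loop does the same with the suffix 2.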
ArrayList list=new ArrayList((int)listLen); @@ -555,8 +555,8 @@ private void createIndexObjects() { class Pair { int valueY; int positionY; - }; - + } + ArrayList> list=new ArrayList>(); System.out.println("Generating HDT Index for ?PO, and ??O queries."); @@ -611,11 +611,7 @@ public int compare(Pair o1, Pair o2) { for(int j=0;j bitmaps; + private final Map bitmaps; private long count; public FourSectionDictionaryEntriesDiff(HDT hdtOriginal, Bitmap deleteBitmap, IteratorTripleID iterator) { @@ -76,7 +76,7 @@ public long getCount() { } @Override - public Map getBitmaps() { + public Map getBitmaps() { return bitmaps; } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/MultipleSectionDictionaryEntriesDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/MultipleSectionDictionaryEntriesDiff.java index 33dcca4a..30846376 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/MultipleSectionDictionaryEntriesDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/MultipleSectionDictionaryEntriesDiff.java @@ -12,7 +12,6 @@ import java.util.HashMap; import java.util.Map; -import java.util.TreeMap; /** * Implementation of the {@link DictionaryEntriesDiff} for multiple section dictionaries @@ -22,8 +21,8 @@ public class MultipleSectionDictionaryEntriesDiff implements DictionaryEntriesDi private final HDT hdtOriginal; private final IteratorTripleID iterator; private final Bitmap bitArrayDisk; - private final HashMap bitmaps; - private final HashMap objectsOffsets; + private final Map bitmaps; + private final Map objectsOffsets; private long count; public MultipleSectionDictionaryEntriesDiff(HDT hdtOriginal, Bitmap deleteBitmap, IteratorTripleID iterator) { @@ -46,9 +45,9 @@ public void loadBitmaps() { this.bitmaps.put("S", BitmapFactory.createRWBitmap(dict.getSubjects().getNumberOfElements())); // create bitmaps for all objects - TreeMap allObjects = dict.getAllObjects(); + Map allObjects = dict.getAllObjects(); long count = 0; - for (Map.Entry next : allObjects.entrySet()) { + for (Map.Entry next : allObjects.entrySet()) { this.bitmaps.put(next.getKey(), BitmapFactory.createRWBitmap(next.getValue().getNumberOfElements())); objectsOffsets.put(next.getKey(), count); count += next.getValue().getNumberOfElements(); @@ -76,7 +75,7 @@ public void loadBitmaps() { if (objId <= numShared) { this.bitmaps.get("SH_O").set(objId - 1, true); } else { - String dataType = this.hdtOriginal.getDictionary().dataTypeOfId(objId); + CharSequence dataType = this.hdtOriginal.getDictionary().dataTypeOfId(objId); long numObjectsBefore = objectsOffsets.get(dataType); this.bitmaps.get(dataType).set(objId - numObjectsBefore - numShared - 1, true); } @@ -90,7 +89,7 @@ public long getCount() { } @Override - public HashMap getBitmaps() { + public Map getBitmaps() { return bitmaps; } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/PredicateIndex.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/PredicateIndex.java index 32a47af6..fb3090b3 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/PredicateIndex.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/PredicateIndex.java @@ -9,15 +9,15 @@ import org.rdfhdt.hdt.util.io.CountInputStream; public interface PredicateIndex { - public long getNumOcurrences(long pred); - public long getBase(long pred); - public long getOccurrence(long base, long occ); + long getNumOcurrences(long pred); + long getBase(long pred); + long getOccurrence(long base, long occ); 
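In loadBitmaps above, a global object id beyond the shared range is mapped to a bit position inside its datatype section by subtracting the shared count and the offset of the preceding sections. A worked sketch with illustrative numbers:

long numShared = 10;       // shared subject-object ids come first
long numObjectsBefore = 5; // objectsOffsets value: 5 objects sit in earlier sections
long objId = 18;           // global object id to mark
long bit = objId - numObjectsBefore - numShared - 1; // = 2
// -> set bit 2 of this datatype's bitmap, matching
// bitmaps.get(dataType).set(objId - numObjectsBefore - numShared - 1, true)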
- public void load(InputStream in) throws IOException; + void load(InputStream in) throws IOException; - public void save(OutputStream in) throws IOException; + void save(OutputStream in) throws IOException; - public void mapIndex(CountInputStream input, File f, ProgressListener listener) throws IOException; + void mapIndex(CountInputStream input, File f, ProgressListener listener) throws IOException; void generate(ProgressListener listener); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TripleIDInt.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TripleIDInt.java index 33b77357..09156c91 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TripleIDInt.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TripleIDInt.java @@ -72,7 +72,7 @@ public TripleIDInt(long subject, long predicate, long object) { /** * Build a TripleID as a copy of another one. - * @param other + * @param other other */ public TripleIDInt(TripleIDInt other) { super(); @@ -134,9 +134,9 @@ public void setPredicate(int predicate) { /** * Replace all components of a TripleID at once. Useful to reuse existing objects. - * @param subject - * @param predicate - * @param object + * @param subject subject + * @param predicate predicate + * @param object object */ public void setAll(int subject, int predicate, int object) { this.subject = subject; @@ -208,9 +208,7 @@ public boolean match(TripleID pattern) { /* Remember that 0 acts as a wildcard */ if (subjectPattern == 0 || this.subject == subjectPattern) { if (predicatePattern == 0 || this.predicate == predicatePattern) { - if (objectPattern == 0 || this.object == objectPattern) { - return true; - } + return objectPattern == 0 || this.object == objectPattern; } } return false; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesListLong.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesListLong.java index 84ef75f5..22f55e7b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesListLong.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/TriplesListLong.java @@ -409,7 +409,7 @@ public void close() throws IOException { */ public class TriplesListIterator implements SuppliableIteratorTripleID { private long lastPosition; - private TriplesListLong triplesList; + private final TriplesListLong triplesList; private int pos; public TriplesListIterator(TriplesListLong triplesList) { @@ -431,7 +431,7 @@ public boolean hasNext() { @Override public TripleID next() { lastPosition = pos; - return triplesList.arrayOfTriples.get((int)pos++); + return triplesList.arrayOfTriples.get(pos++); } /* (non-Javadoc) diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java index bdadc045..b6decac2 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java @@ -1,15 +1,15 @@ package org.rdfhdt.hdt.util; -import java.util.HashMap; import java.util.Iterator; +import java.util.Map; public class CustomIterator implements Iterator { public CharSequence prev = ""; boolean first = true; Iterator iter; - HashMap literalsCounts; + Map literalsCounts; private long currCount; - public CustomIterator(Iterator iter, HashMap literalsCounts) { + public CustomIterator(Iterator iter, Map literalsCounts) { this.iter = iter; this.literalsCounts = literalsCounts; if(iter.hasNext()) { @@ 
-42,12 +42,10 @@ public boolean hasNext() { public CharSequence next() { if(first) { first = false; - return LiteralsUtils.removeType(prev); - } - else { + } else { prev = iter.next(); currCount--; - return LiteralsUtils.removeType(prev); } + return LiteralsUtils.removeType(prev); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java index 2b8d1a61..4353131d 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java @@ -1,65 +1,348 @@ package org.rdfhdt.hdt.util; -import org.apache.jena.graph.Node; -import org.rdfhdt.hdt.rdf.parsers.JenaNodeCreator; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.CompactString; +import org.rdfhdt.hdt.util.string.DelayedString; +import org.rdfhdt.hdt.util.string.ReplazableString; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.ConcurrentModificationException; public class LiteralsUtils { - static Pattern pattern = Pattern.compile("\".*\"\\^\\^<.*>"); - - - public static boolean containsLanguage(String str){ - Node node = JenaNodeCreator.createLiteral(str); - String lang = node.getLiteralLanguage(); - return !lang.equals(""); - } - public static String getType(CharSequence str){ - - Node node; - char firstChar = str.charAt(0); - // TODO split blank nodes as well in a seperate section -// if(firstChar=='_') { -// node = JenaNodeCreator.createAnon(str.toString()); -// } - if(firstChar=='"') { - node = JenaNodeCreator.createLiteral(str.toString()); - String dataType = node.getLiteralDatatypeURI(); - return "<"+dataType+">"; -}else{ - return "NO_DATATYPE"; - } - -// Matcher matcher = pattern.matcher(str); -// String dataType; -// if(matcher.find()){ -// dataType = str.toString().split("\\^")[2]; -// }else{ -// dataType = "NO_DATATYPE"; -// } -// return dataType; - } - public static String removeType(CharSequence str){ - String res = ""; -// char firstChar = str.charAt(0); -// if(firstChar == '"'){ -// Node node = JenaNodeCreator.createLiteral(str.toString()); -// res = node.getLiteralValue().toString(); -// String str1 = node.getLiteral().toString(); -// return res; -// } -// return str.toString(); - Matcher matcher = pattern.matcher(str); - if(matcher.matches()){ - String temp = str.toString(); - int index = temp.lastIndexOf("^"); - res = temp.substring(0,index-1); - - //res = str.toString().split("\\^")[0]; - }else{ - res = str.toString(); - } - return res; - } + public static final String NO_DATATYPE_STR = "NO_DATATYPE"; + static final String LITERAL_LANG_TYPE_STR = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#langString>"; + /** + * no datatype type + */ + public static final CharSequence NO_DATATYPE = new CompactString(NO_DATATYPE_STR); + public static final CharSequence LITERAL_LANG_TYPE = new CompactString(LITERAL_LANG_TYPE_STR); + + /** + * test if the node is a literal and contains a language + * + * @param str the node + * @return true if the node is a literal and contains a language, false otherwise + * @throws java.util.ConcurrentModificationException if the node is updated while reading + */ + public static boolean containsLanguage(CharSequence str) { + if (str.length() == 0 || str.charAt(0) != '"') { + return false; // not a literal + } + + for (int i = str.length() - 1; i >= 0; i--) { + char c = str.charAt(i); + + // https://www.w3.org/TR/n-triples/#n-triples-grammar + if
((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '-') { + // lang tag, ignore + continue; + } + + if (c == '"') { + // end the literal, no lang tag + return false; + } + + // start of the lang tag + return c == '@'; + } + throw new ConcurrentModificationException("Update of the char sequence while reading!"); + } + + /** + * get the index of the last ^^ of the literal type + * + * @param str node + * @return index of the start of the uri type + * @throws java.util.ConcurrentModificationException if the node is updated while reading + */ + private static int getTypeIndex(CharSequence str) { + if (str.length() == 0 || str.charAt(0) != '"' || str.charAt(str.length() - 1) != '>') { + return -1; // not a literal + } + int i = str.length() - 1; + + // find end of the type + while (i > 0) { + if (str.charAt(i) == '<' && str.charAt(i - 1) != '\\') { + break; + } + i--; + } + + char c = str.charAt(i - 1); + + // https://www.w3.org/TR/n-triples/#n-triples-grammar + if (c == '"' || c == '@') { + return -1; // no type, syntax error???? + } + + if (c == '^') { + return i; + } + + throw new ConcurrentModificationException("Update of the char sequence while reading!"); + } + + /** + * get the datatype of a literal node + * + * @param str the node + * @return the datatype IRI of the literal, {@link #LITERAL_LANG_TYPE} for a language literal, or {@link #NO_DATATYPE} if the node isn't a typed literal + * @throws java.util.ConcurrentModificationException if the node is updated while reading + */ + public static CharSequence getType(CharSequence str) { + if (containsLanguage(str)) { + return LITERAL_LANG_TYPE; + } + + int index = getTypeIndex(str); + + if (index != -1 && index < str.length()) { + return str.subSequence(index, str.length()); + } else { + return NO_DATATYPE; + } + } + + /** + * remove the node type if the node is a typed literal; this method returns the char sequence itself or a subSequence of it + * + * @param str the node + * @return the node, without its type if it had one + * @throws java.util.ConcurrentModificationException if the node is updated while reading + */ + public static CharSequence removeType(CharSequence str) { + int index = getTypeIndex(str); + + if (index != -1 && index < str.length()) { + return str.subSequence(0, index - 2); + } else { + return str; + } + } + + static boolean isLangType(CharSequence s, int start) { + if (start + LITERAL_LANG_TYPE_STR.length() > s.length()) { + return false; + } + // we can use the string version because the langString IRI is in ASCII + for (int i = 0; i < LITERAL_LANG_TYPE_STR.length(); i++) { + if (s.charAt(i + start) != LITERAL_LANG_TYPE_STR.charAt(i)) { + return false; + } + } + return true; + } + + /** + * place the type before the literal + * + *

example: {@literal "aa"^^} -> {@literal ^^"aa"}

+ *

example: "aa" -> "aa"

+ *

example: "aa"@fr -> {@literal ^^"aa"@fr}

+ * + * @param str the literal + * @return prefixed literal + */ + public static CharSequence litToPref(CharSequence str) { + // language literal + if (containsLanguage(str)) { + ReplazableString prefixedValue = new ReplazableString(2 + LITERAL_LANG_TYPE.length() + str.length()); + prefixedValue.append(new byte[]{'^', '^'}, 0, 2); + prefixedValue.append(((CompactString) LITERAL_LANG_TYPE).getData()); + prefixedValue.appendNoCompact(str); + return prefixedValue; + } + + int index = getTypeIndex(str); + + // typed literal + if (index != -1 && index < str.length()) { + // add the literal value + ReplazableString prefixedValue = new ReplazableString(str.length()); + prefixedValue.append(new byte[]{'^', '^'}, 0, 2); + prefixedValue.appendNoCompact(str, index, str.length() - index); + prefixedValue.appendNoCompact(str, 0, index - 2); + return prefixedValue; + } + + return str; + } + + /** + * remove the type of a prefixed literal + * + * @param str the prefixed literal + * @return literal + * @see #removeType(CharSequence) + */ + public static CharSequence removePrefType(CharSequence str) { + if (str.length() < 4 || !(str.charAt(0) == '^' && str.charAt(1) == '^')) { + // prefixed type + return str; + } + + assert str.charAt(2) == '<' : "non typed literal prefix"; + + int index = 3; + + while (index < str.length()) { + char c = str.charAt(index); + if (c == '>') { + break; + } + index++; + } + assert index < str.length() - 1 && str.charAt(index + 1) == '"' : "badly typed literal prefix"; + + return str.subSequence(index + 1, str.length()); + } + + /** + * replace the literal before the type + * + *

example: {@literal ^^"aa"} -> {@literal "aa"^^}

+ *

example: "aa" -> "aa"

+ *

example: {@literal ^^"aa"@fr} -> "aa"@fr

+ * + * @param str the prefixed literal + * @return literal + */ + public static CharSequence prefToLit(CharSequence str) { + if (str.length() < 4 || !(str.charAt(0) == '^' && str.charAt(1) == '^')) { + return str; + } + + assert str.charAt(2) == '<' : "non typed literal prefix"; + + int index = 3; + + if (isLangType(str, 2)) { + // lang type, return without the type + return str.subSequence(LITERAL_LANG_TYPE.length() + 2, str.length()); + } + + while (index < str.length()) { + char c = str.charAt(index); + if (c == '>') { + break; + } + index++; + } + assert index < str.length() - 1 && str.charAt(index + 1) == '"' : "badly typed literal prefix"; + + ReplazableString bld = new ReplazableString(str.length()); + bld.appendNoCompact(str, index + 1, str.length() - index - 1); + bld.appendNoCompact(str, 0, index + 1); + return bld; + } + + /** + * add {@literal '<'} and {@literal '>'} to a CharSequence, will have the same behaviors as a byte string + * + * @param s1 string + * @return embed version of s1 + */ + public static CharSequence embed(CharSequence s1) { + s1 = DelayedString.unwrap(s1); + if (s1 == null || s1.length() == 0) { + return EmbeddedURI.EMPTY; + } + if (s1.charAt(0) == '<' && s1.charAt(s1.length() - 1) == '>') { + return s1; + } + return new EmbeddedURI(s1); + } + + private static class EmbeddedURI implements CharSequence { + private static final CharSequence EMPTY = new CompactString("<>"); + private int hash; + private final CharSequence parent; + + public EmbeddedURI(CharSequence parent) { + this.parent = parent; + } + + @Override + public int length() { + return parent.length() + 2; + } + + @Override + public char charAt(int index) { + if (index == 0) { + return '<'; + } + if (index == parent.length() + 1) { + return '>'; + } + return parent.charAt(index - 1); + } + + @Override + public CharSequence subSequence(int start, int end) { + if (start == 0 && end == length()) { + return this; + } + if (start == 0 || end == length()) { + return new CompactString(this.toString().subSequence(start, end)); + } + + return parent.subSequence(start - 1, end - 1); + } + + @Override + public String toString() { + return "<" + parent + ">"; + } + + @Override + public boolean equals(Object o) { + if (o == null) { + return false; + } + if (this == o) { + return true; + } + if (!(o instanceof CharSequence)) { + return false; + } + + return CharSequenceComparator.getInstance().compare(this, (CharSequence) o) == 0; + } + + @Override + public int hashCode() { + // FNV Hash function: http://isthe.com/chongo/tech/comp/fnv/ + if (hash == 0) { + hash = (int) 2166136261L; + int i = length(); + + while (i-- != 0) { + hash = (hash * 16777619) ^ charAt(i); + } + } + return hash; + } + } + + /** + * test if a sequence is a No datatype string + * + * @param seq sequence + * @return true if seq == "NO_DATATYPE" + */ + public static boolean isNoDatatype(CharSequence seq) { + if (seq.length() != NO_DATATYPE.length()) { + return false; + } + for (int i = 0; i < NO_DATATYPE.length(); i++) { + if (NO_DATATYPE.charAt(i) != seq.charAt(i)) { + return false; + } + } + return true; + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java index 3507004b..f99eaec8 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java @@ -41,7 +41,7 @@ public static Profiler readFromDisk(Path inputPath) throws IOException { throw new IOException("Missing header 
for the profiling file!"); } } - p.mainSection = p.new Section(is); + p.mainSection = p.new Section(is, 0); if (!is.readCRCAndCheck()) { throw new IllegalArgumentException("CRC doesn't match when reading the CRC!"); } @@ -74,7 +74,7 @@ public Profiler(String name) { public Profiler(String name, HDTOptions spec) { this.name = Objects.requireNonNull(name, "name can't be null!"); if (spec != null) { - disabled = !"true".equalsIgnoreCase(spec.get(HDTOptionsKeys.PROFILER_KEY)); + disabled = !spec.getBoolean(HDTOptionsKeys.PROFILER_KEY); String profilerOutputLocation = spec.get(HDTOptionsKeys.PROFILER_OUTPUT_KEY); if (profilerOutputLocation != null && !profilerOutputLocation.isEmpty()) { outputPath = Path.of(profilerOutputLocation); @@ -194,7 +194,7 @@ public class Section { * @param is input stream * @throws IOException io exception */ - Section(InputStream is) throws IOException { + Section(InputStream is, int deep) throws IOException { start = VByte.decode(is); end = VByte.decode(is); @@ -202,10 +202,12 @@ public class Section { byte[] nameBytes = IOUtil.readBuffer(is, nameLength, null); name = new String(nameBytes, StandardCharsets.UTF_8); + maxSize = Math.max(name.length() + deep * 2, maxSize); + int subSize = (int) VByte.decode(is); subSections = new ArrayList<>(subSize); for (int i = 0; i < subSize; i++) { - subSections.add(new Section(is)); + subSections.add(new Section(is, deep + 1)); } } @@ -299,8 +301,12 @@ void stop() { end = System.nanoTime(); } + public long getMillis() { + return (end - start) / 1_000_000L; + } + void writeProfiling(String prefix, boolean isLast) { - System.out.println(prefix + (getSubSections().isEmpty() ? "+--" : "+-+") + " [" + getName() + "] " + "-".repeat(1 + maxSize - getName().length()) + " elapsed=" + (end - start) / 1_000_000L + "ms"); + System.out.println(prefix + (getSubSections().isEmpty() ? "+--" : "+-+") + " [" + getName() + "] " + "-".repeat(1 + maxSize - getName().length()) + " elapsed=" + getMillis() + "ms"); for (int i = 0; i < subSections.size(); i++) { Section s = subSections.get(i); s.writeProfiling(prefix + (isLast ? 
" " : "| "), i == subSections.size() - 1); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/ProfilingUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/ProfilingUtil.java index ae34a3c0..65ea98f9 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/ProfilingUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/ProfilingUtil.java @@ -100,19 +100,19 @@ public static String tidyFileSize(long size ){ if (size >= 1024 * 1024 * 1024) { calcSize = (long) (((double)size) / (1024 * 1024 * 1024)); - str = Long.toString(calcSize) +"GB"; + str = calcSize +"GB"; } else if (size>= 1024 * 1024) { calcSize = (long) (((double)size) / (1024 * 1024 )); - str = Long.toString(calcSize) +"MB"; + str = calcSize +"MB"; } else if (size>= 1024) { calcSize = (long) (((double)size) / (1024)); - str = Long.toString(calcSize) +"KB"; + str = calcSize +"KB"; } else { calcSize = size; - str = Long.toString(calcSize) +"B"; + str = calcSize +"B"; } return str; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/RDFInfo.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/RDFInfo.java index 95e436d5..cbcdb1f6 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/RDFInfo.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/RDFInfo.java @@ -115,10 +115,7 @@ public static long getTriples(HDTOptions specs){ * Checks if "rdf.triples" property was set by the user */ public static boolean triplesSet(HDTOptions specs){ - if (specs.get(triples_prop)!=null) - return true; - else - return false; + return specs.get(triples_prop) != null; } /** @@ -146,7 +143,7 @@ public static float getCompression(HDTOptions specs){ * */ public static long countLines(String filename, RDFParserCallback parser, RDFNotation notation) - throws FileNotFoundException, IOException, ParserException { + throws IOException, ParserException { InputStream is = new BufferedInputStream(new FileInputStream(filename)); try { byte[] c = new byte[1024]; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java index c215e2d2..638743ab 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java @@ -1,6 +1,11 @@ package org.rdfhdt.hdt.util.concurrent; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; import java.util.Objects; +import java.util.Stack; +import java.util.concurrent.atomic.AtomicLong; /** * Thread allowing exception and returning it when joining it with {@link #joinAndCrashIfRequired()} or by using @@ -10,6 +15,59 @@ * @author Antoine Willerval */ public class ExceptionThread extends Thread { + private static final AtomicLong ID_COUNT = new AtomicLong(); + static boolean debug; + static final Stack> DEBUG_STACK = new Stack<>(); + + /** + * start the debug of the thread + */ + public static void startDebug() { + debug = true; + if (!DEBUG_STACK.isEmpty()) { + throw new IllegalArgumentException("non empty debug stack, bad config?"); + } + pushDebugPoint(); + } + + /** + * push a new sub-set of debug thread + */ + public static void pushDebugPoint() { + DEBUG_STACK.push(Collections.synchronizedMap(new HashMap<>())); + } + + /** + * push a new sub-set of debug thread + * + * @param name name to id the pop + */ + public static void popDebugPoint(String name) { + if (DEBUG_STACK.isEmpty()) { + throw new IllegalArgumentException("empty debug 
stack, bad config?"); + } + + Map map = DEBUG_STACK.pop(); + if (map.isEmpty()) { + return; + } + + AssertionError error = new AssertionError("Non empty stack at point " + name); + + map.values().forEach(error::addSuppressed); + + throw error; + } + + /** + * end the debug of the thread + */ + public static void endDebug() { + debug = false; + popDebugPoint("end debug"); + DEBUG_STACK.clear(); + } + /** * create exception threads of multiple runnables * @@ -17,7 +75,7 @@ public class ExceptionThread extends Thread { * @param runnables the runnables list, can't be empty * @return exception thread attached with other runnables * @throws java.lang.IllegalArgumentException if the array is empty - * @throws java.lang.NullPointerException if an argument is null + * @throws java.lang.NullPointerException if an argument is null */ public static ExceptionThread async(String name, ExceptionRunnable... runnables) { Objects.requireNonNull(name, "name can't be null!"); @@ -47,8 +105,8 @@ public interface ExceptionRunnable { /** * Runnable used in an {@link org.rdfhdt.hdt.util.concurrent.ExceptionThread}, can throw an exception * - * @see org.rdfhdt.hdt.util.concurrent.ExceptionThread#ExceptionThread(org.rdfhdt.hdt.util.concurrent.ExceptionThread.ExceptionRunnable, String) * @throws java.lang.Exception if any + * @see org.rdfhdt.hdt.util.concurrent.ExceptionThread#ExceptionThread(org.rdfhdt.hdt.util.concurrent.ExceptionThread.ExceptionRunnable, String) */ void run() throws Exception; } @@ -57,10 +115,28 @@ public interface ExceptionRunnable { private final ExceptionRunnable target; private ExceptionThread next; private ExceptionThread prev; + private final Map debugMap; + private final long debugId; + + public ExceptionThread(String name) { + this(null, name); + } public ExceptionThread(ExceptionRunnable target, String name) { super(name); - this.target = target; + debugId = ID_COUNT.getAndIncrement(); + + if (debug) { + debugMap = DEBUG_STACK.peek(); + if (debugMap != null) { + // debug + debugMap.put(debugId, new Throwable("ExceptionThread #" + name)); + } + } else { + debugMap = null; + } + + this.target = Objects.requireNonNullElse(target, this::runException); } /** @@ -105,6 +181,14 @@ public ExceptionThread startAll() { return this; } + /** + * implementation used if the runnable is null + * @throws Exception exception + */ + public void runException() throws Exception { + // to impl + } + @Override public final void run() { try { @@ -121,6 +205,10 @@ public final void run() { if (this.prev != null) { this.prev.interruptBackward(t); } + } finally { + if (debugMap != null) { + debugMap.remove(debugId); + } } } @@ -152,7 +240,7 @@ public Throwable getException() { * created. If the thread returned an exception while the current thread is interrupted, the exception will be * suppressed in the {@link java.lang.InterruptedException}. 
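The debug registry above records one Throwable per live ExceptionThread, and run() unregisters the thread in its finally block, so popping a debug point can report every thread that outlived its section. A sketch of the intended test usage (stepOne and stepTwo are hypothetical tasks; the ExceptionThread calls are the ones defined in this class):

ExceptionThread.startDebug();
try {
    ExceptionThread.async("workers",
            () -> stepOne(),
            () -> stepTwo()
    ).startAll().joinAndCrashIfRequired();
} finally {
    // throws an AssertionError with one suppressed Throwable per leaked thread
    ExceptionThread.endDebug();
}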
* - * @throws InterruptedException interruption while joining the thread + * @throws InterruptedException interruption while joining the thread * @throws ExceptionThreadException if the thread or any attached thread returned an exception */ public void joinAndCrashIfRequired() throws InterruptedException { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/KWayMerger.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/KWayMerger.java index 3a0bda82..15187a39 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/KWayMerger.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/KWayMerger.java @@ -259,7 +259,7 @@ public CloseSuppressPath getPath() { } } - private static class Worker extends Thread { + private static class Worker extends ExceptionThread { private final KWayMerger parent; public Worker(String name, KWayMerger parent) { @@ -268,7 +268,7 @@ public Worker(String name, KWayMerger parent) { } @Override - public void run() { + public void runException() throws Exception { try { KWayMergerRunnable task; @@ -277,6 +277,7 @@ public void run() { } } catch (Throwable t) { parent.exception(t); + throw t; } } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseInputStream.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseInputStream.java index aad0d5b8..ed721d25 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseInputStream.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseInputStream.java @@ -12,8 +12,8 @@ */ public class CloseInputStream extends InputStream { - private InputStream in; - private Closeable toClose; + private final InputStream in; + private final Closeable toClose; public CloseInputStream(InputStream in, Closeable toClose) { this.in = in; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java index 979f56d7..ca330050 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java @@ -28,6 +28,7 @@ import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; +import org.rdfhdt.hdt.compact.integer.VByte; import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.util.Reference; @@ -238,7 +239,7 @@ public static String readLine(InputStream in, char character) throws IOException } buf.write(value); } - return new String(buf.toByteArray()); // Uses default encoding + return buf.toString(); // Uses default encoding } public static String readChars(InputStream in, int numChars) throws IOException { @@ -257,6 +258,17 @@ public static void writeString(OutputStream out, String str) throws IOException out.write(str.getBytes(ByteStringUtil.STRING_ENCODING)); } + + public static void writeSizedBuffer(OutputStream output, byte[] buffer, ProgressListener listener) throws IOException { + writeSizedBuffer(output, buffer, 0, buffer.length, listener); + } + + public static void writeSizedBuffer(OutputStream output, byte[] buffer, int offset, int length, ProgressListener listener) throws IOException { + // FIXME: Do by blocks and notify listener + VByte.encode(output, length); + output.write(buffer, offset, length); + } + public static void writeBuffer(OutputStream output, byte[] buffer, int offset, int length, ProgressListener listener) throws 
IOException { // FIXME: Do by blocks and notify listener output.write(buffer, offset, length); @@ -298,7 +310,7 @@ public static void copyFile(File src, File dst) throws IOException { public static void moveFile(File src, File dst) throws IOException { copyFile(src, dst); - src.delete(); + Files.deleteIfExists(src.toPath()); } public static void decompressGzip(File src, File trgt) throws IOException { @@ -319,9 +331,9 @@ public static void decompressGzip(File src, File trgt) throws IOException { /** * Write long, little endian * - * @param output - * @param value - * @throws IOException + * @param output os + * @param value long + * @throws IOException io exception */ public static void writeLong(OutputStream output, long value) throws IOException { byte[] writeBuffer = new byte[8]; @@ -340,8 +352,8 @@ public static void writeLong(OutputStream output, long value) throws IOException /** * Read long, little endian. * - * @param input - * @throws IOException + * @param input is + * @throws IOException io exception */ public static long readLong(InputStream input) throws IOException { int n = 0; @@ -367,9 +379,9 @@ public static long readLong(InputStream input) throws IOException { /** * Write int, little endian * - * @param output - * @param value - * @throws IOException + * @param output os + * @param value value + * @throws IOException io exception */ public static void writeInt(OutputStream output, int value) throws IOException { byte[] writeBuffer = new byte[4]; @@ -418,6 +430,13 @@ public static int byteArrayToInt(byte[] value) { return (value[3] << 24) + (value[2] << 16) + (value[1] << 8) + (value[0] << 0); } + public static byte[] readSizedBuffer(InputStream input, ProgressListener listener) throws IOException { + long size = VByte.decode(input); + if (size > Integer.MAX_VALUE - 5 || size < 0) { + throw new IOException("Read bad sized buffer: " + size); + } + return readBuffer(input, (int) size, listener); + } /** * @param input din * @param length bytes @@ -517,6 +536,7 @@ public static void closeQuietly(Closeable output) { try { output.close(); } catch (IOException e) { + // ignore } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java index 20e38217..8008ad70 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java @@ -9,10 +9,12 @@ import org.rdfhdt.hdt.util.crc.CRC8; import org.rdfhdt.hdt.util.crc.CRCInputStream; import org.rdfhdt.hdt.util.string.ReplazableString; +import org.rdfhdt.hdt.utils.DebugOrderNodeIterator; import java.io.Closeable; import java.io.IOException; import java.io.InputStream; +import java.util.function.Consumer; /** * Class to read a compress node file @@ -27,6 +29,7 @@ public class CompressNodeReader implements ExceptionIterator consumer; public CompressNodeReader(InputStream stream) throws IOException { this.stream = new CRCInputStream(stream, new CRC8()); @@ -37,6 +40,7 @@ public CompressNodeReader(InputStream stream) throws IOException { this.stream.setCRC(new CRC32()); this.tempString = new ReplazableString(); this.last = new IndexedNode(tempString, -1); + consumer = DebugOrderNodeIterator.of("stream", true); } public long getSize() { @@ -61,6 +65,7 @@ public IndexedNode read() throws IOException { tempString.replace2(stream, delta); long index = VByte.decode(stream); last.setIndex(index); + 
consumer.accept(last); waiting = true; return last; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java index be9018ef..369dc8fa 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java @@ -3,6 +3,7 @@ import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.ReplazableString; @@ -183,8 +184,10 @@ public boolean hasNext() { } while (it.hasNext()) { IndexedNode node = it.next(); - CharSequence next = node.getNode(); - if (CharSequenceComparator.getInstance().compare(prev, next) == 0) { + ByteString next = (ByteString) node.getNode(); + int cmp = prev.compareTo(next); + assert cmp <= 0: "bad order : " + prev + " > " + next; + if (cmp == 0) { // same as previous, ignore assert this.id != node.getIndex() : "same index and prevIndex"; duplicatedNodeConsumer.onDuplicated(this.id, node.getIndex(), lastHeader); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java index 82575686..b8f54892 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java @@ -65,7 +65,7 @@ public static void notifyCond(ProgressListener listener, String message, long co public static MultiThreadListener multiThreadListener(ProgressListener listener) { // null, create an empty one if (listener == null) { - return new PrefixMultiThreadListener((a, b) -> { + return((a, b, c) -> { }); } // already a multi thread listener diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java new file mode 100644 index 00000000..aeca8775 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java @@ -0,0 +1,30 @@ +package org.rdfhdt.hdt.util.string; + +public interface ByteString extends CharSequence, Comparable { + static ByteString of(CharSequence sec) { + return ByteStringUtil.asByteString(sec); + } + + byte[] getBuffer(); + + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + default int compareTo(ByteString other) { + int n = Math.min(length(), other.length()); + byte[] buffer1 = getBuffer(); + byte[] buffer2 = other.getBuffer(); + + int k = 0; + while (k < n) { + byte c1 = buffer1[k]; + byte c2 = buffer2[k]; + if (c1 != c2) { + return (c1 & 0xFF) - (c2 & 0xFF); + } + k++; + } + return length() - other.length(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java index 6ba9ecc4..50d14817 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java @@ -69,6 +69,30 @@ public static String asString(ByteBuffer buff, int offset) { return new String(arr, STRING_ENCODING); } + /** + * convert this char sequence to a byte string (if required) + * + * @param sec the char 
sequence + * @return byte string + */ + public static ByteString asByteString(CharSequence sec) { + sec = DelayedString.unwrap(sec); + + if (sec == null) { + return null; + } + + if (sec.length() == 0) { + return CompactString.EMPTY; + } + + if (sec instanceof ByteString) { + return (ByteString) sec; + } + // clone into sec + return new CompactString(sec); + } + public static int strlen(byte [] buff, int off) { int len = buff.length; int pos = off; @@ -305,31 +329,34 @@ public static int strcmp(CharSequence str, BigMappedByteBuffer buffer, long offs } public static int append(OutputStream out, CharSequence str, int start) throws IOException { - byte [] bytes; - int len; - if(str instanceof DelayedString) { str = ((DelayedString) str).getInternal(); } - + if(str instanceof String) { - bytes = ((String) str).getBytes(ByteStringUtil.STRING_ENCODING); - len = bytes.length; - } else if(str instanceof CompactString) { - bytes = ((CompactString) str).getData(); - len = bytes.length; - } else if(str instanceof ReplazableString) { - bytes = ((ReplazableString) str).getBuffer(); - len = ((ReplazableString) str).used; + return append(out, (String) str, start); + } else if(str instanceof ByteString) { + return append(out, (ByteString) str, start); } else { throw new NotImplementedException(); } - + } + + public static int append(OutputStream out, ByteString str, int start) throws IOException { + return append(out, str.getBuffer(), start, str.length()); + } + + public static int append(OutputStream out, String str, int start) throws IOException { + byte[] bytes = str.getBytes(ByteStringUtil.STRING_ENCODING); + return append(out, bytes, start, bytes.length); + } + + public static int append(OutputStream out, byte [] bytes, int start, int len) throws IOException { // Write and remove null characters int cur = start; int ini = start; int written = 0; - + while(cur getInstance() { return instance; } - + + private final Comparator base = CharSequenceComparator.getInstance(); + /* (non-Javadoc) * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) */ @@ -50,36 +52,15 @@ public int compare(CharSequence s1, CharSequence s2) { if (s1 == s2) { return 0; } - String type1 = LiteralsUtils.getType(s1); - String type2 = LiteralsUtils.getType(s2); - int x = type1.compareTo(type2); + CharSequence type1 = LiteralsUtils.getType(s1); + CharSequence type2 = LiteralsUtils.getType(s2); + + int x = base.compare(type1, type2); + if (x != 0) { return x; } else { // data types are equal - s1 = DelayedString.unwrap(s1); - s2 = DelayedString.unwrap(s2); - - if (s1 instanceof CompactString && s2 instanceof CompactString) { - CompactString cs1 = (CompactString) s1; - CompactString cs2 = (CompactString) s2; - return cs1.compareTo(cs2); - } - - if (s1 instanceof String && s2 instanceof String) { - String rs1 = (String) s1; - String rs2 = (String) s2; - return rs1.compareTo(rs2); - } - - if (s1 instanceof ReplazableString && s2 instanceof ReplazableString) { - ReplazableString cs1 = (ReplazableString) s1; - ReplazableString cs2 = (ReplazableString) s2; - return cs1.compareTo(cs2); - } - - // Slower but safe - - return s1.toString().compareTo(s2.toString()); + return base.compare(s1, s2); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CompactString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CompactString.java index e6be6a0f..a0109a53 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CompactString.java +++ 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CompactString.java @@ -37,7 +37,7 @@ * @author mario.arias * */ -public class CompactString implements CharSequence, Serializable, Comparable { +public class CompactString implements CharSequence, Serializable, ByteString { private static final long serialVersionUID = 6789858615261959413L; @@ -53,12 +53,13 @@ private CompactString() { this.data = new byte[0]; } - public CompactString(ReplazableString str) { - data = Arrays.copyOf( str.buffer, str.used ); + public CompactString(ByteString str) { + data = Arrays.copyOf( str.getBuffer(), str.length() ); } public CompactString(CompactString other) { data = Arrays.copyOf( other.data, other.data.length); + hash = other.hash; } public CompactString(String other) { @@ -66,14 +67,28 @@ public CompactString(String other) { } public CompactString(CharSequence other) { - data = other.toString().getBytes(ByteStringUtil.STRING_ENCODING); + if (other instanceof CompactString) { + CompactString str = (CompactString) other; + data = Arrays.copyOf(str.data, str.data.length); + hash = str.hash; + } else if (other instanceof ReplazableString) { + ReplazableString str = (ReplazableString) other; + data = Arrays.copyOf(str.buffer, str.used); + } else { + data = other.toString().getBytes(ByteStringUtil.STRING_ENCODING); + } } public byte [] getData() { return data; } - private CompactString(byte[] data) { + @Override + public byte[] getBuffer() { + return getData(); + } + + public CompactString(byte[] data) { this.data = data; } @@ -97,11 +112,7 @@ public int lastIndexOf(char ch) { @Override public char charAt(int index) { - int ix = index; - if (ix >= data.length) { - throw new StringIndexOutOfBoundsException("Invalid index " + index + " length " + length()); - } - return (char) (data[ix] & 0xff); + return (char) (data[index] & 0xff); } @Override @@ -168,25 +179,6 @@ public boolean equals(Object o) { throw new NotImplementedException(); } - /* (non-Javadoc) - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ - @Override - public int compareTo(CompactString other) { - int n = Math.min( this.data.length, other.data.length); - - int k = 0; - while (k < n) { - int c1 = this.data[k]&0xFF; - int c2 = other.data[k]&0xFF; - if (c1 != c2) { - return c1 - c2; - } - k++; - } - return this.data.length - other.data.length; - } - public CharSequence getDelayed() { return new DelayedString(this); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java index 48865bef..ef9be8f4 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java @@ -35,6 +35,7 @@ import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.util.io.BigByteBuffer; import org.rdfhdt.hdt.util.io.BigMappedByteBuffer; +import org.rdfhdt.hdt.util.io.IOUtil; /** @@ -44,7 +45,7 @@ * @author mario.arias * */ -public final class ReplazableString implements CharSequence, Comparable { +public final class ReplazableString implements CharSequence, ByteString { byte [] buffer; int used; @@ -62,7 +63,8 @@ public ReplazableString(byte [] buffer) { this.buffer = buffer; this.used = buffer.length; } - + + @Override public byte [] getBuffer() { return buffer; } @@ -72,7 +74,11 @@ private void ensureSize(int size) { buffer = Arrays.copyOf(buffer, Math.max(size, buffer.length * 2)); } } - + + public void append(byte 
[] data) { + this.append(data, 0, data.length); + } + public void append(byte [] data, int offset, int len) { this.replace(used, data, offset, len); } @@ -89,12 +95,39 @@ public void append(CharSequence other) { used+=other.length(); } - public void replace(CharSequence other) { + public void appendNoCompact(CharSequence other) { + other = DelayedString.unwrap(other); + if (other instanceof ReplazableString) { - ReplazableString o2 = (ReplazableString) other; - ensureSize(o2.used); - System.arraycopy(o2.buffer, 0, buffer, 0, o2.used); - used = o2.used; + ReplazableString rs = (ReplazableString) other; + this.append(rs.getBuffer(), 0, rs.used); + } else if (other instanceof CompactString) { + this.append(((CompactString) other).getData()); + } else { + this.append(other.toString().getBytes(ByteStringUtil.STRING_ENCODING)); + } + } + + public void appendNoCompact(CharSequence other, int offset, int length) { + other = DelayedString.unwrap(other); + + if (other instanceof ByteString) { + this.append(((ByteString) other).getBuffer(), offset, length); + } else { + this.append(other.toString().substring(offset, offset + length) + .getBytes(ByteStringUtil.STRING_ENCODING)); + } + } + + public void replace(ByteString other) { + ensureSize(other.length()); + System.arraycopy(other.getBuffer(), 0, buffer, 0, other.length()); + used = other.length(); + } + + public void replace(CharSequence other) { + if (other instanceof ByteString) { + replace((ByteString) other); } else { used = 0; byte[] bytes = other.toString().getBytes(StandardCharsets.UTF_8); @@ -116,9 +149,8 @@ public void replace(int pos, BigByteBuffer data, long offset, int len) { } public void replace(InputStream in, int pos, int len) throws IOException { - ensureSize(pos+len); - in.read(buffer, pos, len); - used = pos+len; + byte[] buffer = IOUtil.readBuffer(in, len, null); + replace(pos, buffer, 0, len); } public void replace(ByteBuffer in, int pos, int len) throws IOException { @@ -225,6 +257,9 @@ public void replace(BigMappedByteBuffer in, int pos) throws IOException { */ @Override public char charAt(int index) { + if (index >= used) { + throw new StringIndexOutOfBoundsException("Invalid index " + index + " length " + length()); + } return (char)(buffer[index] & 0xFF); } @@ -312,25 +347,6 @@ public String toString() { return new String(buffer, 0, used, ByteStringUtil.STRING_ENCODING); } - /* (non-Javadoc) - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ - @Override - public int compareTo(ReplazableString other) { - int n = Math.min(used, other.used); - - int k = 0; - while (k < n) { - int c1 = this.buffer[k] & 0xFF; - int c2 = other.buffer[k] & 0xFF; - if (c1 != c2) { - return c1 - c2; - } - k++; - } - return used - other.used; - } - public CharSequence getDelayed() { return new DelayedString(this); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java new file mode 100644 index 00000000..cd430cc7 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java @@ -0,0 +1,71 @@ +package org.rdfhdt.hdt.utils; + +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.ReplazableString; + +import java.util.Comparator; +import java.util.function.Consumer; + +public class DebugOrderNodeIterator implements Consumer { + public static boolean isAssertEnable() { + try { + assert false; + return false; + } 
catch (AssertionError e) { + return true; + } + } + + public static Consumer of(String name) { + return of(name, false); + } + + public static Consumer of(String name, boolean allowDuplicated) { + return of(isAssertEnable(), name, allowDuplicated); + } + + public static Consumer of(boolean debug, String name) { + return of(debug, name, false); + } + + public static Consumer of(boolean debug, String name, boolean allowDuplicated) { + return of(debug, name, allowDuplicated, CharSequenceComparator.getInstance()); + } + + public static Consumer of(boolean debug, String name, boolean allowDuplicated, Comparator comparator) { + if (debug) { + return new DebugOrderNodeIterator(comparator, name, allowDuplicated); + } + return (t) -> { + }; + } + + private final Comparator comparator; + private final String name; + private final ReplazableString prevBuffer = new ReplazableString(16); + private final boolean allowDuplicated; + + private DebugOrderNodeIterator(Comparator comparator, String name, boolean allowDuplicated) { + this.comparator = comparator; + this.name = name; + this.allowDuplicated = allowDuplicated; + } + + + @Override + public void accept(IndexedNode obj) { + CharSequence node = obj.getNode(); + if (prevBuffer.length() != 0) { + int cmp = comparator.compare(prevBuffer, node); + if (cmp == 0 && !allowDuplicated) { + throw new AssertionError("DUPLICATION ERROR: prevBuffer == comparator for string '" + node + "' == '" + prevBuffer + "' in section " + name); + } + if (cmp > 0) { + throw new AssertionError("ORDER ERROR: prevBuffer > comparator for string '" + node + "' > '" + prevBuffer + "' in section " + name); + } + } + prevBuffer.replace(node); + } + +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java index 13e0315b..02f7d593 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java @@ -9,6 +9,7 @@ import org.rdfhdt.hdt.triples.IndexedNode; import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.io.compress.CompressTest; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import java.io.IOException; @@ -43,7 +44,7 @@ public void compressDictTest() throws Exception { "4444", "7777" ); CompressFourSectionDictionary dictionary = new CompressFourSectionDictionary(result, new FakeNodeConsumer(), (p, m) -> { - }); + }, true); Iterator su = dictionary.getSubjects().getSortedEntries(); Iterator pr = dictionary.getPredicates().getSortedEntries(); Iterator ob = dictionary.getObjects().getSortedEntries(); @@ -114,17 +115,17 @@ public long getTripleCount() { @Override public ExceptionIterator getSubjects() { - return ExceptionIterator.of(new MapIterator<>(Arrays.asList(subjects).iterator(), s -> new IndexedNode(s, sid++))); + return ExceptionIterator.of(new MapIterator<>(Arrays.asList(subjects).iterator(), s -> new IndexedNode(ByteString.of(s), sid++))); } @Override public ExceptionIterator getPredicates() { - return ExceptionIterator.of(new MapIterator<>(Arrays.asList(predicates).iterator(), s -> new IndexedNode(s, pid++))); + return ExceptionIterator.of(new MapIterator<>(Arrays.asList(predicates).iterator(), s -> new IndexedNode(ByteString.of(s), pid++))); } @Override public ExceptionIterator getObjects() { - 
return ExceptionIterator.of(new MapIterator<>(Arrays.asList(objects).iterator(), s -> new IndexedNode(s, oid++))); + return ExceptionIterator.of(new MapIterator<>(Arrays.asList(objects).iterator(), s -> new IndexedNode(ByteString.of(s), oid++))); } @Override diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java index 693131c4..b6f7f2e1 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java @@ -7,6 +7,7 @@ import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.triples.IndexedNode; import org.rdfhdt.hdt.util.io.compress.CompressUtil; +import org.rdfhdt.hdt.util.string.ByteString; import java.util.Arrays; import java.util.Iterator; @@ -60,7 +61,11 @@ private Iterator removeDupe(List nodes) { return new MapIterator<>( CompressUtil.asNoDupeCharSequenceIterator( - ExceptionIterator.of(nodes.iterator()), + ExceptionIterator.of(nodes.iterator()) + .map(in -> { + in.setNode(ByteString.of(in.getNode())); + return in; + }), (i, j, k) -> { } ), IndexedNode::getNode diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java index 60ef66ec..a1f4ff49 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java @@ -1,5 +1,7 @@ package org.rdfhdt.hdt.hdt; +import org.apache.commons.io.file.PathUtils; +import org.junit.After; import org.junit.Before; import org.junit.Ignore; import org.junit.Rule; @@ -8,8 +10,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.junit.runners.Suite; +import org.rdfhdt.hdt.compact.bitmap.BitmapFactory; +import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; import org.rdfhdt.hdt.dictionary.Dictionary; import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.dictionary.impl.MultipleBaseDictionary; import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.NotFoundException; @@ -27,6 +32,7 @@ import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils; import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier; import org.rdfhdt.hdt.util.StopWatch; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.io.compress.CompressTest; @@ -34,21 +40,41 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; - -import static org.junit.Assert.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Random; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; @RunWith(Suite.class) @Suite.SuiteClasses({ HDTManagerTest.DynamicDiskTest.class, HDTManagerTest.DynamicCatTreeTest.class, + HDTManagerTest.FileDynamicTest.class, 
	HDTManagerTest.StaticTest.class
})
public class HDTManagerTest {
	private static class HDTManagerTestBase extends AbstractMapMemoryTest implements ProgressListener {
-		protected static final long SIZE = 1L << 16;
+		protected static String[][] diskDict() {
+			return new String[][]{
+//					{HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH},
+					{HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_HASH}
+			};
+		}
+
+		protected static final long SIZE_VALUE = 1L << 16;
+		protected static final int SEED = 67;
 		@Rule
 		public TemporaryFolder tempDir = new TemporaryFolder();
 		protected HDTSpecification spec;
 		@Before
 		public void setupManager() throws IOException {
 			spec = new HDTSpecification();
-			spec.set("loader.disk.location", tempDir.newFolder().getAbsolutePath());
+			spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, tempDir.newFolder().getAbsolutePath());
+			ExceptionThread.startDebug();
+		}
+
+		@After
+		public void closeManager() {
+			ExceptionThread.endDebug();
 		}
 		@Override
 		public void notifyProgress(float level, String message) {
 		}
 		protected void assertEqualsHDT(HDT expected, HDT actual) throws NotFoundException {
-			assertEqualsHDT(expected, actual, 0);
-		}
-		protected void assertEqualsHDT(HDT expected, HDT actual, int ignoredHeader) throws NotFoundException {
			// test dictionary
 			Dictionary ed = expected.getDictionary();
 			Dictionary ad = actual.getDictionary();
 			assertEqualsHDT("Subjects", ed.getSubjects(), ad.getSubjects());
 			assertEqualsHDT("Predicates", ed.getPredicates(), ad.getPredicates());
-			assertEqualsHDT("Objects", ed.getObjects(), ad.getObjects());
+			if (ed instanceof MultipleBaseDictionary) {
+				assertTrue(ad instanceof MultipleBaseDictionary);
+				MultipleBaseDictionary edm = (MultipleBaseDictionary) ed;
+				MultipleBaseDictionary adm = (MultipleBaseDictionary) ad;
+				Map<? extends CharSequence, DictionarySection> keysE = edm.getAllObjects();
+				Map<? extends CharSequence, DictionarySection> keysA = adm.getAllObjects();
+				assertEquals(keysE.keySet(), keysA.keySet());
+				keysE.forEach((key, dictE) -> {
+					DictionarySection dictA = keysA.get(key);
+
+					assertEqualsHDT(key.toString(), dictE, dictA);
+				});
+			} else {
+				assertFalse(ad instanceof MultipleBaseDictionary);
+				assertEqualsHDT("Objects", ed.getObjects(), ad.getObjects());
+			}
 			assertEqualsHDT("Shared", ed.getShared(), ad.getShared());
 			assertEquals(ed.getType(), ad.getType());
@@ -94,68 +137,78 @@ protected void assertEqualsHDT(HDT expected, HDT actual, int ignoredHeader) thro
 			// test header
 			assertEquals(actual.getHeader().getBaseURI(), expected.getHeader().getBaseURI());
-			if (expected.getHeader().getNumberOfElements() + ignoredHeader != actual.getHeader().getNumberOfElements()) {
+			if (expected.getHeader().getNumberOfElements() != actual.getHeader().getNumberOfElements()) {
 				StringBuilder bld = new StringBuilder();
 				bld.append("-------- Header expected:");
-				expected.getHeader().search(null, null, null).forEachRemaining(bld::append);
+				expected.getHeader().search("", "", "").forEachRemaining(l -> bld.append(l).append('\n'));
 				bld.append("-------- Header actual:");
-				actual.getHeader().search(null, null, null).forEachRemaining(bld::append);
+				actual.getHeader().search("", "", "").forEachRemaining(l -> bld.append(l).append('\n'));
-				fail("Size of the header doesn't match " + bld + expected.getHeader().getNumberOfElements() + " + " + ignoredHeader + "!=" + actual.getHeader().getNumberOfElements());
+				fail("Size of the header doesn't match " + bld + expected.getHeader().getNumberOfElements() + "!=" + actual.getHeader().getNumberOfElements());
 			}
 		}
 		protected void assertEqualsHDT(String section, DictionarySection excepted, DictionarySection actual) {
 			Iterator<? extends CharSequence> itEx = excepted.getSortedEntries();
 			Iterator<? extends CharSequence> itAc = actual.getSortedEntries();
+			assertEquals("dictionary section sizes don't match", excepted.getNumberOfElements(), actual.getNumberOfElements());
 			while (itEx.hasNext()) {
-				assertTrue(itAc.hasNext());
+				assertTrue("dictionary section " + section + " is smaller than expected", itAc.hasNext());
 				CharSequence expectedTriple = itEx.next();
 				CharSequence actualTriple = itAc.next();
 				CompressTest.assertCharSequenceEquals(section + " section strings", expectedTriple, actualTriple);
 			}
-			assertFalse(itAc.hasNext());
-			assertEquals(excepted.getNumberOfElements(), actual.getNumberOfElements());
+			assertFalse("dictionary section " + section + " is bigger than expected", itAc.hasNext());
 		}
 	}
 	@RunWith(Parameterized.class)
 	public static class DynamicDiskTest extends HDTManagerTestBase {
-		@Parameterized.Parameters(name = "{0}")
+		@Parameterized.Parameters(name = "{7} - {0}")
 		public static Collection<Object[]> params() {
 			List<Object[]> params = new ArrayList<>();
-			for (int threads : new int[]{
-					// sync
-					1,
-					// async, low thread count
-					2,
-					// async, large thread count
-					8
-			}) {
-				List<String> modes;
-				if (threads > 1) {
-					// async, no need for partial
-					modes = List.of(
-							HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE
-					);
-				} else {
-					modes = List.of(
-							HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL,
-							HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE
-					);
-				}
-				for (String mode : modes) {
-					params.addAll(List.of(
-							new Object[]{"base-w" + threads + "-" + mode, SIZE * 8, 20, 50, threads, mode, false},
-							new Object[]{"duplicates-w" + threads + "-" + mode, SIZE * 8, 10, 50, threads, mode, false},
-							new Object[]{"large-literals-w" + threads + "-" + mode, SIZE * 2, 20, 250, threads, mode, false},
-							new Object[]{"quiet-w" + threads + "-" + mode, SIZE * 8, 10, 50, threads, mode, false}
-					));
+			for (String[] dict : diskDict()) {
+				params.addAll(List.of(
+						new Object[]{"slow-str1", 10, 2, 4, 2, HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE, false, dict[0], dict[1], 2, "debug.disk.slow.stream=true"},
+						new Object[]{"slow-str2", 10, 2, 4, 2, HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE, false, dict[0], dict[1], 2, "debug.disk.slow.stream2=true"},
+						new Object[]{"slow-cfsd", 10, 2, 4, 2, HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE, false, dict[0], dict[1], 2, "debug.disk.slow.pfsd=true"},
+						new Object[]{"slow-kw-d", 10, 2, 4, 2, HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE, false, dict[0], dict[1], 2, "debug.disk.slow.kway.dict=true"},
+						new Object[]{"slow-kw-t", 10, 2, 4, 2, HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE, false, dict[0], dict[1], 2, "debug.disk.slow.kway.triple=true"}
+				));
+				for (int threads : new int[]{
+						// sync
+						1,
+						// async, low thread count
+						2,
+						// async, large thread count
+						8
+				}) {
+					List<String> modes;
+					if (threads > 1) {
+						// async, no need for partial
+						modes = List.of(
+								HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE
+						);
+					} else {
+						modes = List.of(
+								HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL,
+								HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE
+						);
+					}
+					for (String mode : modes) {
+						params.addAll(List.of(
+								new Object[]{"base-w" + threads + "-" + mode, SIZE_VALUE * 8, 20, 50, threads, mode, 
false, dict[0], dict[1], SIZE_VALUE, ""}, + new Object[]{"duplicates-w" + threads + "-" + mode, SIZE_VALUE * 8, 10, 50, threads, mode, false, dict[0], dict[1], SIZE_VALUE, ""}, + new Object[]{"large-literals-w" + threads + "-" + mode, SIZE_VALUE * 2, 20, 250, threads, mode, false, dict[0], dict[1], SIZE_VALUE, ""}, + new Object[]{"quiet-w" + threads + "-" + mode, SIZE_VALUE * 8, 10, 50, threads, mode, false, dict[0], dict[1], SIZE_VALUE, ""} + )); + } } } + return params; } @@ -173,19 +226,36 @@ public static Collection params() { public String compressMode; @Parameterized.Parameter(6) public boolean quiet; + @Parameterized.Parameter(7) + public String dictionaryType; + @Parameterized.Parameter(8) + public String tempDictionaryType; + @Parameterized.Parameter(9) + public long size; + @Parameterized.Parameter(10) + public String addedSpecs; @Before public void setupSpecs() { + spec.setOptions(addedSpecs); spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY, String.valueOf(threads)); - spec.set("loader.disk.compressMode", compressMode); + spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY, compressMode); + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, dictionaryType); + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, tempDictionaryType); + spec.set(HDTOptionsKeys.LOADER_DISK_NO_COPY_ITERATOR_KEY, true); } private void generateDiskTest() throws IOException, ParserException, NotFoundException, InterruptedException { LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxSize(maxSize, 34) + .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) - .withMaxLiteralSize(maxLiteralSize); + .withMaxLiteralSize(maxLiteralSize) + .withSameTripleString(true); + + if (spec.getBoolean("debug.disk.slow.stream")) { + supplier.withSlowStream(25); + } // create DISK HDT LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.GZIP); @@ -232,6 +302,9 @@ private void generateDiskTest() throws IOException, ParserException, NotFoundExc assertNotNull(actual); try { assertEqualsHDT(expected, actual); + } catch (Throwable t) { + HDTTestUtils.printCoDictionary(expected, actual); + throw t; } finally { IOUtil.closeAll(expected, actual); } @@ -241,7 +314,7 @@ private void generateDiskTest() throws IOException, ParserException, NotFoundExc public void generateSaveLoadMapTest() throws IOException, ParserException, NotFoundException { LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxSize(maxSize, 34) + .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) .withMaxLiteralSize(maxLiteralSize); @@ -269,13 +342,15 @@ public void generateSaveLoadMapTest() throws IOException, ParserException, NotFo @Test public void generateDiskMemTest() throws IOException, ParserException, NotFoundException, InterruptedException { - spec.setInt("loader.disk.chunkSize", SIZE); + spec.set(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY, size); + spec.set("debug.disk.build", true); generateDiskTest(); } @Test public void generateDiskMapTest() throws IOException, ParserException, NotFoundException, InterruptedException { - spec.setInt("loader.disk.chunkSize", SIZE); + spec.set(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY, size); + spec.set("debug.disk.build", true); File mapHDT = tempDir.newFile("mapHDTTest.hdt"); spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath()); generateDiskTest(); @@ -286,7 +361,7 @@ public void 
generateDiskMapTest() throws IOException, ParserException, NotFoundE public void catTreeTest() throws IOException, ParserException, NotFoundException, InterruptedException { LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxSize(maxSize, 34) + .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) .withMaxLiteralSize(maxLiteralSize); @@ -295,7 +370,7 @@ public void catTreeTest() throws IOException, ParserException, NotFoundException HDT actual = null; try { actual = HDTManager.catTree( - RDFFluxStop.sizeLimit(SIZE), + RDFFluxStop.sizeLimit(size), HDTSupplier.memory(), genActual.getStream(), HDTTestUtils.BASE_URI, @@ -325,25 +400,28 @@ public void catTreeTest() throws IOException, ParserException, NotFoundException assertNotNull(expected); assertNotNull(actual); try { - assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat + assertEqualsHDT(expected, actual); // -1 for the original size ignored by hdtcat } finally { IOUtil.closeAll(expected, actual); } } + @Test public void catTreeDiskTest() throws IOException, ParserException, NotFoundException, InterruptedException { LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxSize(maxSize, 34) + .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) .withMaxLiteralSize(maxLiteralSize); + spec.set("debug.disk.build", true); + // create DISK HDT LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.NONE); HDT actual = null; try { actual = HDTManager.catTree( - RDFFluxStop.sizeLimit(SIZE), + RDFFluxStop.sizeLimit(size), HDTSupplier.disk(), genActual.getStream(), HDTTestUtils.BASE_URI, @@ -373,7 +451,7 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep assertNotNull(expected); assertNotNull(actual); try { - assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat + assertEqualsHDT(expected, actual); // -1 for the original size ignored by hdtcat } finally { IOUtil.closeAll(expected, actual); } @@ -383,14 +461,16 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep @RunWith(Parameterized.class) public static class DynamicCatTreeTest extends HDTManagerTestBase { - @Parameterized.Parameters(name = "{0}") + @Parameterized.Parameters(name = "{5} - {0}") public static Collection params() { - return List.of( - new Object[]{"base", SIZE * 16, 20, 50, false}, - new Object[]{"duplicates", SIZE * 16, 10, 50, false}, - new Object[]{"large-literals", SIZE * 4, 20, 250, false}, - new Object[]{"quiet", SIZE * 16, 10, 50, false} - ); + List params = new ArrayList<>(); + for (String[] dict : diskDict()) { + params.add(new Object[]{"base", SIZE_VALUE * 16, 20, 50, false, dict[0], dict[1], SIZE_VALUE}); + params.add(new Object[]{"duplicates", SIZE_VALUE * 16, 10, 50, false, dict[0], dict[1], SIZE_VALUE}); + params.add(new Object[]{"large-literals", SIZE_VALUE * 4, 20, 250, false, dict[0], dict[1], SIZE_VALUE}); + params.add(new Object[]{"quiet", SIZE_VALUE * 16, 10, 50, false, dict[0], dict[1], SIZE_VALUE}); + } + return params; } @Parameterized.Parameter @@ -403,12 +483,24 @@ public static Collection params() { public int maxLiteralSize; @Parameterized.Parameter(4) public boolean quiet; + @Parameterized.Parameter(5) + public String dictionaryType; + @Parameterized.Parameter(6) + public String tempDictionaryType; + @Parameterized.Parameter(7) + public long size; 
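The cat-tree tests in this class drive the new HDTManager.catTree entry point through the fake dataset supplier. Outside the test harness, a minimal standalone call looks like the sketch below; the input file name, base URI, and flux-stop thresholds are placeholder values for illustration, not values mandated by this patch:

    import org.rdfhdt.hdt.enums.RDFNotation;
    import org.rdfhdt.hdt.hdt.HDT;
    import org.rdfhdt.hdt.hdt.HDTManager;
    import org.rdfhdt.hdt.hdt.HDTSupplier;
    import org.rdfhdt.hdt.options.HDTSpecification;
    import org.rdfhdt.hdt.rdf.RDFFluxStop;

    public class CatTreeExample {
        public static void main(String[] args) throws Exception {
            // Close each chunk after ~64 MB of parsed triple data or 1M triples,
            // whichever limit triggers first; the chunks are then merged with HDTCat.
            RDFFluxStop fluxStop = RDFFluxStop.sizeLimit(64L << 20)
                    .or(RDFFluxStop.countLimit(1_000_000L));
            try (HDT hdt = HDTManager.catTree(
                    fluxStop,
                    HDTSupplier.memory(),   // or HDTSupplier.disk() for larger chunks
                    "dataset.nt",           // placeholder input file
                    "http://example.org/#", // placeholder base URI
                    RDFNotation.NTRIPLES,
                    new HDTSpecification(),
                    null                    // no progress listener
            )) {
                System.out.println(hdt.getTriples().getNumberOfElements() + " triples");
            }
        }
    }

The supplier argument decides how each chunk is built (in memory or on disk) before the cat step, which is what the memory and disk variants of the tests below exercise.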
+ + @Before + public void setupSpecs() { + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, dictionaryType); + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, tempDictionaryType); + } @Test public void catTreeTest() throws IOException, ParserException, NotFoundException, InterruptedException { LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxSize(maxSize, 34) + .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) .withMaxLiteralSize(maxLiteralSize); @@ -419,7 +511,7 @@ public void catTreeTest() throws IOException, ParserException, NotFoundException try { try { actual = HDTManager.catTree( - RDFFluxStop.sizeLimit(SIZE), + RDFFluxStop.sizeLimit(size), HDTSupplier.memory(), genActual.getStream(), HDTTestUtils.BASE_URI, @@ -448,16 +540,17 @@ public void catTreeTest() throws IOException, ParserException, NotFoundException // happy compiler, should throw before assertNotNull(expected); assertNotNull(actual); - assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat + assertEqualsHDT(expected, actual); // -1 for the original size ignored by hdtcat } finally { IOUtil.closeAll(expected, actual); } } + @Test public void catTreeDiskTest() throws IOException, ParserException, NotFoundException, InterruptedException { LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxSize(maxSize, 34) + .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) .withMaxLiteralSize(maxLiteralSize); @@ -466,7 +559,7 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep HDT actual = null; try { actual = HDTManager.catTree( - RDFFluxStop.sizeLimit(SIZE), + RDFFluxStop.sizeLimit(size), HDTSupplier.disk(), genActual.getStream(), HDTTestUtils.BASE_URI, @@ -496,18 +589,19 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep assertNotNull(expected); assertNotNull(actual); try { - assertEqualsHDT(expected, actual, -1); // -1 for the original size ignored by hdtcat + assertEqualsHDT(expected, actual); // -1 for the original size ignored by hdtcat } finally { IOUtil.closeAll(expected, actual); } } } + @RunWith(Parameterized.class) - public static class StaticTest extends HDTManagerTestBase { + public static class FileDynamicTest extends HDTManagerTestBase { @Parameterized.Parameters(name = "{0}") public static Collection params() { return List.of( - new Object[]{"hdtGenDisk/unicode_disk_encode.nt", true} + new Object[]{"hdtGenDisk/unicode_disk_encode.nt", true, SIZE_VALUE} ); } @@ -515,6 +609,8 @@ public static Collection params() { public String file; @Parameterized.Parameter(1) public boolean quiet; + @Parameterized.Parameter(2) + public long size; private void generateDiskTest() throws IOException, ParserException, NotFoundException { @@ -546,32 +642,35 @@ private void generateDiskTest() throws IOException, ParserException, NotFoundExc @Test public void generateDiskCompleteTest() throws IOException, ParserException, NotFoundException { - spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_COMPLETE); - spec.setInt("loader.disk.chunkSize", SIZE); + spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY, CompressionResult.COMPRESSION_MODE_COMPLETE); + spec.set(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY, size); + spec.set("debug.disk.build", true); generateDiskTest(); } @Test public void generateDiskPartialTest() throws IOException, ParserException, NotFoundException { - 
spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_PARTIAL); - spec.setInt("loader.disk.chunkSize", SIZE); + spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY, CompressionResult.COMPRESSION_MODE_PARTIAL); + spec.set(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY, size); + spec.set("debug.disk.build", true); generateDiskTest(); } @Test public void generateDiskCompleteMapTest() throws IOException, ParserException, NotFoundException { - spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_COMPLETE); - spec.setInt("loader.disk.chunkSize", SIZE); + spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY, CompressionResult.COMPRESSION_MODE_COMPLETE); + spec.set(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY, size); File mapHDT = tempDir.newFile("mapHDTTest.hdt"); spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath()); + spec.set("debug.disk.build", true); generateDiskTest(); Files.deleteIfExists(mapHDT.toPath()); } @Test public void generateDiskPartialMapTest() throws IOException, ParserException, NotFoundException { - spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_PARTIAL); - spec.setInt("loader.disk.chunkSize", SIZE); + spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY, CompressionResult.COMPRESSION_MODE_PARTIAL); + spec.set(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY, size); File mapHDT = tempDir.newFile("mapHDTTest.hdt"); spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath()); generateDiskTest(); @@ -606,12 +705,149 @@ public void generateTest() throws IOException, ParserException, NotFoundExceptio } } + public static class StaticTest extends HDTManagerTestBase { + @Test + public void multiSectionTest() throws ParserException, IOException, NotFoundException { + Path root = tempDir.getRoot().toPath(); + Path hdtFile = root.resolve("testhdt.hdt"); + LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier + .createSupplierWithMaxTriples(10_000, 32) + .withMaxLiteralSize(30) + .withUnicode(true); + + // set MultiSectionDictionary type + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH); + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS); + + try (HDT hdt = HDTManager.generateHDT( + supplier.createTripleStringStream(), + HDTTestUtils.BASE_URI, spec, + null + )) { + assertTrue(hdt.getDictionary() instanceof MultipleBaseDictionary); + String testHdt = hdtFile.toString(); + hdt.saveToHDT( + testHdt, + null + ); + + // test mapping + try (HDT hdt2 = HDTManager.mapHDT(testHdt)) { + assertTrue(hdt2.getDictionary() instanceof MultipleBaseDictionary); + assertEqualsHDT(hdt, hdt2); + } + // test loading + try (HDT hdt2 = HDTManager.loadHDT(testHdt)) { + assertTrue(hdt2.getDictionary() instanceof MultipleBaseDictionary); + assertEqualsHDT(hdt, hdt2); + } + } finally { + Files.deleteIfExists(hdtFile); + } + Path fakeNt = root.resolve("fake.nt"); + try { + supplier.createNTFile(fakeNt); + try (HDT hdt = HDTManager.generateHDT( + fakeNt.toString(), + HDTTestUtils.BASE_URI, + RDFNotation.NTRIPLES, + spec, + null + )) { + String testHdt = hdtFile.toString(); + + hdt.saveToHDT(testHdt, null); + + // test mapping + try (HDT hdt2 = HDTManager.mapHDT(testHdt)) { + assertTrue(hdt2.getDictionary() instanceof MultipleBaseDictionary); + assertEqualsHDT(hdt, hdt2); + } + // test loading + try (HDT hdt2 = HDTManager.loadHDT(testHdt)) { + assertTrue(hdt2.getDictionary() 
instanceof MultipleBaseDictionary); + assertEqualsHDT(hdt, hdt2); + } + } + } finally { + try { + Files.deleteIfExists(fakeNt); + } finally { + Files.deleteIfExists(hdtFile); + } + } + } + + @Test + public void diffMultiSectTest() throws ParserException, IOException, NotFoundException { + Path root = tempDir.getRoot().toPath(); + Path hdtFile = root.resolve("testhdt.hdt"); + Path diffLocation = root.resolve("diff"); + Files.createDirectories(diffLocation); + LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier + .createSupplierWithMaxTriples(10_000, 32) + .withMaxLiteralSize(30) + .withUnicode(true); + + // set MultiSectionDictionary type + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, "multHash"); + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, "dictionaryMultiObj"); + + try (HDT hdt = HDTManager.generateHDT( + supplier.createTripleStringStream(), + HDTTestUtils.BASE_URI, spec, + null + )) { + assertTrue(hdt.getDictionary() instanceof MultipleBaseDictionary); + String testHdt = hdtFile.toString(); + hdt.saveToHDT( + testHdt, + null + ); + + ModifiableBitmap bitmap; + + // test mapping + long n; + try (HDT hdt2 = HDTManager.mapHDT(testHdt)) { + bitmap = BitmapFactory.createRWBitmap(hdt2.getTriples().getNumberOfElements()); + assertTrue(hdt2.getDictionary() instanceof MultipleBaseDictionary); + assertEqualsHDT(hdt, hdt2); + + n = hdt2.getTriples().getNumberOfElements(); + } + + Random rnd = new Random(SEED); + for (long i = 0; i < n / 24; i++) { + bitmap.set(Math.abs(rnd.nextLong()) % n, true); + } + + try (HDT hdtDiff = HDTManager.diffHDTBit( + diffLocation.toAbsolutePath().toString(), + testHdt, + bitmap, + spec, + null + )) { + assertEquals(hdt.getTriples().getNumberOfElements() - bitmap.countOnes(), hdtDiff.getTriples().getNumberOfElements()); + } + } finally { + try { + Files.deleteIfExists(hdtFile); + } finally { + PathUtils.deleteDirectory(diffLocation); + } + } + } + + } + @Ignore("handTests") public static class HandTest extends HDTManagerTestBase { @Test public void bigDiskTest() throws ParserException, IOException { LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxSize(10_000_000_000L, 94); + .createSupplierWithMaxSize(100_000_000L, 94); Path output = tempDir.newFolder().toPath(); @@ -629,6 +865,12 @@ public void bigDiskTest() throws ParserException, IOException { System.out.println(hdt.getTriples().getNumberOfElements()); } } + + @Test + public void zqdz() { + System.out.println("\255".getBytes(StandardCharsets.UTF_8)[0] & 0xFF); + } + @Test public void bigCatTreeDiskTest() throws ParserException, IOException { HDTOptions spec = new HDTSpecification(); @@ -651,6 +893,7 @@ public void bigCatTreeDiskTest() throws ParserException, IOException { System.out.println(hdt.getTriples().getNumberOfElements()); } } + @Test public void bigGenCatTreeDiskTest() throws ParserException, IOException { LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier @@ -676,4 +919,5 @@ public void bigGenCatTreeDiskTest() throws ParserException, IOException { } } } + } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterTest.java index 183eab60..6e26f8a3 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterTest.java @@ -40,46 +40,35 @@ private String getFile(String f) { return 
Objects.requireNonNull(getClass().getClassLoader().getResource(f), "Can't find " + f).getFile(); } @Test - public void bNodeXTest() throws ParserException, IOException, NotFoundException { - HDT hdt = HDTManager.generateHDT(getFile("importer/bnode_x.nt"), HDTTestUtils.BASE_URI, RDFNotation.NTRIPLES, spec, null); - hdt.search("", "", "").forEachRemaining(System.out::println); - hdt.close(); + public void bNodeXTest() throws ParserException, IOException { + HDTManager.generateHDT(getFile("importer/bnode_x.nt"), HDTTestUtils.BASE_URI, RDFNotation.NTRIPLES, spec, null).close(); } @Test - public void bNodeZTest() throws ParserException, IOException, NotFoundException { - HDT hdt = HDTManager.generateHDT(getFile("importer/bnode_z.nt"), HDTTestUtils.BASE_URI, RDFNotation.NTRIPLES, spec, null); - hdt.search("", "", "").forEachRemaining(System.out::println); - hdt.close(); + public void bNodeZTest() throws ParserException, IOException { + HDTManager.generateHDT(getFile("importer/bnode_z.nt"), HDTTestUtils.BASE_URI, RDFNotation.NTRIPLES, spec, null).close(); } private Iterator asIt(String file) throws ParserException { List triples = new ArrayList<>(); RDFNotation notation = RDFNotation.guess(file); RDFParserCallback parser = RDFParserFactory.getParserCallback(notation); - parser.doParse(file, HDTTestUtils.BASE_URI, notation, true, new RDFParserCallback.RDFCallback() { - @Override - public void processTriple(TripleString triple, long pos) { - // force duplication of the triple string data - triples.add(new TripleString( - triple.getSubject().toString(), - triple.getPredicate().toString(), - triple.getObject().toString() - )); - } + parser.doParse(file, HDTTestUtils.BASE_URI, notation, true, (triple, pos) -> { + // force duplication of the triple string data + triples.add(new TripleString( + triple.getSubject().toString(), + triple.getPredicate().toString(), + triple.getObject().toString() + )); }); return triples.iterator(); } @Test - public void bNodeXStreamTest() throws ParserException, IOException, NotFoundException { - HDT hdt = HDTManager.generateHDT(asIt(getFile("importer/bnode_x.nt")), HDTTestUtils.BASE_URI, spec, null); - hdt.search("", "", "").forEachRemaining(System.out::println); - hdt.close(); + public void bNodeXStreamTest() throws ParserException, IOException { + HDTManager.generateHDT(asIt(getFile("importer/bnode_x.nt")), HDTTestUtils.BASE_URI, spec, null).close(); } @Test - public void bNodeZStreamTest() throws ParserException, IOException, NotFoundException { - HDT hdt = HDTManager.generateHDT(asIt(getFile("importer/bnode_z.nt")), HDTTestUtils.BASE_URI, spec, null); - hdt.search("", "", "").forEachRemaining(System.out::println); - hdt.close(); + public void bNodeZStreamTest() throws ParserException, IOException { + HDTManager.generateHDT(asIt(getFile("importer/bnode_z.nt")), HDTTestUtils.BASE_URI, spec, null).close(); } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatLiteralsTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatLiteralsTest.java index 0060b89b..803c107f 100755 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatLiteralsTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatLiteralsTest.java @@ -1,12 +1,18 @@ package org.rdfhdt.hdt.hdtCat; +import org.apache.commons.io.file.PathUtils; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; import org.rdfhdt.hdt.enums.RDFNotation; import 
org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdt.hdtCat.utils.Utility; import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest; @@ -15,9 +21,23 @@ import java.io.IOException; import java.net.URL; import java.nio.file.Files; +import java.util.Collection; +import java.util.List; +@RunWith(Parameterized.class) public class HdtCatLiteralsTest extends AbstractMapMemoryTest implements ProgressListener { + @Parameterized.Parameters(name = "{0}") + public static Collection params() { + return List.of(HDTOptionsKeys.LOAD_HDT_TYPE_VALUE_MAP, HDTOptionsKeys.LOAD_HDT_TYPE_VALUE_LOAD); + } + + @Parameterized.Parameter + public String loadingMethod; + + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + private void help(String filename1, String filename2, String concatFilename) throws ParserException, IOException { ClassLoader classLoader = getClass().getClassLoader(); URL resource = classLoader.getResource(filename1); @@ -37,18 +57,23 @@ private void help(String filename1, String filename2, String concatFilename) thr String concat = resource2.getFile(); - String hdt1Location = file1.replace(".nt", ".hdt"); - String hdt2Location = file2.replace(".nt", ".hdt"); + + String hdt1Location = tempDir.newFile().getAbsolutePath(); + String hdt2Location = tempDir.newFile().getAbsolutePath(); + HDTSpecification spec = new HDTSpecification(); - spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;"); + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH); + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS); + spec.set(HDTOptionsKeys.LOAD_HDT_TYPE_KEY, loadingMethod); + try (HDT hdt = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, this)) { hdt.saveToHDT(hdt1Location, null); } try (HDT hdt = HDTManager.generateHDT(new File(file2).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, this)) { hdt.saveToHDT(hdt2Location, null); } - File file = new File(file1); - File theDir = new File(file.getAbsolutePath() + "_tmp"); + File file = tempDir.newFile(); + File theDir = tempDir.newFolder(); Files.createDirectories(theDir.toPath()); try (HDT hdtCatNew = HDTManager.catHDT(theDir.getAbsolutePath(), hdt1Location, hdt2Location, spec, null)) { @@ -59,7 +84,8 @@ private void help(String filename1, String filename2, String concatFilename) thr HDT hdtCatNew = HDTManager.mapIndexedHDT(file.getAbsolutePath() + "_cat.hdt")) { Utility.compareCustomDictionary(hdtCatOld.getDictionary(), hdtCatNew.getDictionary()); Utility.compareTriples(hdtCatOld, hdtCatNew); - Files.delete(theDir.toPath()); + } finally { + PathUtils.deleteDirectory(theDir.toPath()); } } @@ -83,8 +109,8 @@ private void help(String filename1, String filename2, String concatFilename) thr // System.out.println(search.next()); // } // -// Iterator no_datatype1 = hdt1.getDictionary().getAllObjects().get("NO_DATATYPE").getSortedEntries(); -// Iterator no_datatype2 = hdt2.getDictionary().getAllObjects().get("NO_DATATYPE").getSortedEntries(); +// Iterator no_datatype1 = hdt1.getDictionary().getAllObjects().get(LiteralsUtils.NO_DATATYPE).getSortedEntries(); +// Iterator no_datatype2 = hdt2.getDictionary().getAllObjects().get(LiteralsUtils.NO_DATATYPE).getSortedEntries(); // 
while (no_datatype1.hasNext()){
//			CharSequence next1 = no_datatype1.next();
//			CharSequence next2 = no_datatype2.next();
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatRandomTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatRandomTest.java
index 55afb42a..6746f731 100644
--- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatRandomTest.java
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatRandomTest.java
@@ -10,6 +10,7 @@
 import org.rdfhdt.hdt.hdt.HDT;
 import org.rdfhdt.hdt.hdt.HDTManager;
 import org.rdfhdt.hdt.hdtDiff.HdtDiffTest;
+import org.rdfhdt.hdt.options.HDTOptionsKeys;
 import org.rdfhdt.hdt.options.HDTSpecification;
 import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier;
 import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest;
@@ -38,8 +39,8 @@ public static Collection<Object[]> genParam() {
 	public HdtCatRandomTest(String dictionaryType, String tempDictionaryImpl) {
 		spec = new HDTSpecification();
-		spec.set("dictionary.type", dictionaryType);
-		spec.set("tempDictionary.impl", tempDictionaryImpl);
+		spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, dictionaryType);
+		spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, tempDictionaryImpl);
 	}
 	@Test
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/utils/Utility.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/utils/Utility.java
index cda463d5..0ad25464 100644
--- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/utils/Utility.java
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/utils/Utility.java
@@ -4,6 +4,7 @@
 import org.rdfhdt.hdt.dictionary.Dictionary;
 import org.rdfhdt.hdt.dictionary.DictionarySection;
 import org.rdfhdt.hdt.enums.TripleComponentRole;
+import org.rdfhdt.hdt.exceptions.NotFoundException;
 import org.rdfhdt.hdt.hdt.HDT;
 import org.rdfhdt.hdt.triples.IteratorTripleID;
 import org.rdfhdt.hdt.triples.TripleID;
@@ -60,7 +61,7 @@ public static void printCustomDictionary(Dictionary d) {
 		}
 		System.out.println("OBJECTS");
 		count = 0;
-		for (Map.Entry<String, DictionarySection> stringDictionarySectionEntry : d.getAllObjects().entrySet()) {
+		for (Map.Entry<? extends CharSequence, DictionarySection> stringDictionarySectionEntry : d.getAllObjects().entrySet()) {
 			Iterator<? extends CharSequence> entries = stringDictionarySectionEntry.getValue().getSortedEntries();
 			while (entries.hasNext()) {
 				System.out.println(count + "---" + entries.next().toString());
@@ -111,12 +112,12 @@ public static void compareCustomDictionary(Dictionary d1, Dictionary d2){
 				d1.getSubjects().getSortedEntries(),
 				d2.getSubjects().getSortedEntries()
 		);
-		Iterator<Map.Entry<String, DictionarySection>> hmIter1 = d1.getAllObjects().entrySet().iterator();
-		Iterator<Map.Entry<String, DictionarySection>> hmIter2 = d2.getAllObjects().entrySet().iterator();
+		Iterator<? extends Map.Entry<? extends CharSequence, DictionarySection>> hmIter1 = d1.getAllObjects().entrySet().iterator();
+		Iterator<? extends Map.Entry<? extends CharSequence, DictionarySection>> hmIter2 = d2.getAllObjects().entrySet().iterator();
 		while (hmIter1.hasNext()) {
 			Assert.assertTrue("The dictionaries have a different number of objects subsections", hmIter2.hasNext());
-			Map.Entry<String, DictionarySection> entry1 = hmIter1.next();
-			Map.Entry<String, DictionarySection> entry2 = hmIter2.next();
+			Map.Entry<? extends CharSequence, DictionarySection> entry1 = hmIter1.next();
+			Map.Entry<? extends CharSequence, DictionarySection> entry2 = hmIter2.next();
 			assertEquals(
 					entry1.getValue().getSortedEntries(),
 					entry2.getValue().getSortedEntries()
@@ -130,13 +131,10 @@
 		);
 	}
 	public static void printTriples(HDT hdt){
-		IteratorTripleID it = hdt.getTriples().searchAll();
-		while (it.hasNext()){
-			TripleID tripleIDOld = it.next();
-			String subject = hdt.getDictionary().idToString(tripleIDOld.getSubject(), TripleComponentRole.SUBJECT).toString();
-			String predicate = hdt.getDictionary().idToString(tripleIDOld.getPredicate(), 
TripleComponentRole.PREDICATE).toString(); - String object = hdt.getDictionary().idToString(tripleIDOld.getObject(), TripleComponentRole.OBJECT).toString(); - System.out.println(subject+"--"+predicate+"--"+object); + try { + hdt.search("", "", "").forEachRemaining(s -> System.out.println(s.getSubject()+"--"+s.getPredicate()+"--"+s.getObject())); + } catch (NotFoundException e) { + throw new AssertionError(e); } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffStaticTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffStaticTest.java index e077a09d..d5526ae0 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffStaticTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffStaticTest.java @@ -10,6 +10,7 @@ import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils; import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest; @@ -37,8 +38,8 @@ public static Collection genParam() { public HdtDiffStaticTest(String dictionaryType, String tempDictionaryImpl) { spec = new HDTSpecification(); - spec.set("dictionary.type", dictionaryType); - spec.set("tempDictionary.impl", tempDictionaryImpl); + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, dictionaryType); + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, tempDictionaryImpl); } private void ntFilesDiffTest(String a, String b, String amb) throws IOException, ParserException { diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java index 22c2d68b..d5ff8912 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java @@ -15,6 +15,7 @@ import org.rdfhdt.hdt.hdt.writer.TripleWriterHDT; import org.rdfhdt.hdt.hdtDiff.utils.EmptyBitmap; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.triples.IteratorTripleString; import org.rdfhdt.hdt.triples.TripleString; @@ -49,10 +50,10 @@ public static class HDTDiffData { } public static final DictionaryTestData[] DICTIONARY_TEST_DATA = { - new DictionaryTestData(HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION, DictionaryFactory.MOD_DICT_IMPL_HASH), - new DictionaryTestData(DictionaryFactory.DICTIONARY_TYPE_FOUR_SECTION_BIG, DictionaryFactory.MOD_DICT_IMPL_HASH), - new DictionaryTestData(DictionaryFactory.DICTIONARY_TYPE_MULTI_OBJECTS, DictionaryFactory.MOD_DICT_IMPL_MULT_HASH), - new DictionaryTestData(HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION, DictionaryFactory.MOD_DICT_IMPL_HASH_PSFC) + new DictionaryTestData(HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_HASH), + new DictionaryTestData(HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_HASH), + new DictionaryTestData(HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH), + new DictionaryTestData(HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_HASH_PSFC) }; /** @@ -136,7 +137,7 @@ public static HDTDiffData createTestHDT(File fileName, File fileName2, File file return count; } - @Parameterized.Parameters(name = "{0}") + 
@Parameterized.Parameters(name = "{0} ({2},{3},{4},{5})") public static Collection genParam() { List list = new ArrayList<>(); for (DictionaryTestData data : DICTIONARY_TEST_DATA) { @@ -200,8 +201,8 @@ public static void assertHdtEquals(HDT hdt1, HDT hdt2) { public HdtDiffTest(String dictionaryType, String tempDictionaryImpl, int subjects, int predicates, int objects, int shared) { spec = new HDTSpecification(); - spec.set("dictionary.type", dictionaryType); - spec.set("tempDictionary.impl", tempDictionaryImpl); + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, dictionaryType); + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, tempDictionaryImpl); this.subjects = subjects; this.predicates = predicates; this.objects = objects; diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java index 097192a5..e96fdcda 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java @@ -301,7 +301,6 @@ public void deepMergeComputeTest() { while (it.hasNext()) { assertTrue(itE.hasNext()); CharSequence seq = ((AssertionCharSequence) it.next().getNode()).getSequence(); - System.out.println(seq); assertEquals(itE.next(), seq); } assertFalse(itE.hasNext()); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java index 00b28993..e9d3330f 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java @@ -5,6 +5,7 @@ import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionary; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.enums.TripleComponentRole; +import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; @@ -16,6 +17,7 @@ import java.io.File; import java.io.IOException; import java.util.AbstractMap; +import java.util.Objects; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; @@ -23,13 +25,12 @@ public class HDTLiteralsDictTest { @Test - public void testIdConversion(){ + public void testIdConversion() throws ParserException, IOException, NotFoundException { ClassLoader classLoader = getClass().getClassLoader(); - String file1 = classLoader.getResource("example4+5.nt").getFile(); + String file1 = Objects.requireNonNull(classLoader.getResource("example4+5.nt")).getFile(); HDTSpecification spec = new HDTSpecification(); spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;"); - try { - HDT hdt1 = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, null); + try (HDT hdt1 = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, null)) { IteratorTripleString iterator = hdt1.search("","",""); while (iterator.hasNext()){ TripleString next = iterator.next(); @@ -47,18 +48,15 @@ public void testIdConversion(){ String obj = hdt1.getDictionary().idToString(objId,TripleComponentRole.OBJECT).toString(); assertEquals(next.getObject(), obj); } - } catch (Exception e) { - e.printStackTrace(); 
} } @Test - public void testGetDataTypeRange(){ + public void testGetDataTypeRange() throws IOException, ParserException{ ClassLoader classLoader = getClass().getClassLoader(); - String file1 = classLoader.getResource("example22.nt").getFile(); + String file1 = Objects.requireNonNull(classLoader.getResource("example22.nt")).getFile(); HDTSpecification spec = new HDTSpecification(); spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;"); - try { - HDT hdt = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, null); + try (HDT hdt = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, null)) { Dictionary dictionary = hdt.getDictionary(); AbstractMap.SimpleEntry dataTypeRange = ((MultipleSectionDictionary) dictionary).getDataTypeRange("http://www.w3.org/2001/XMLSchema#float"); long lower = dataTypeRange.getKey(); @@ -66,35 +64,22 @@ public void testGetDataTypeRange(){ Utility.printTriples(hdt); assertEquals(5,lower); assertEquals(7,upper); - - } catch (IOException e) { - e.printStackTrace(); - } catch (ParserException e) { - e.printStackTrace(); } } @Test - public void testGetDataTypeOfId(){ + public void testGetDataTypeOfId() throws IOException, ParserException{ ClassLoader classLoader = getClass().getClassLoader(); - String file1 = classLoader.getResource("example22.nt").getFile(); + String file1 = Objects.requireNonNull(classLoader.getResource("example22.nt")).getFile(); HDTSpecification spec = new HDTSpecification(); spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;"); - try { - HDT hdt = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, null); + try (HDT hdt = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, null)) { Dictionary dictionary = hdt.getDictionary(); // first get the id of a given string - long id = dictionary.stringToId("\"Ali Haidar\"@en",TripleComponentRole.OBJECT); + long id = dictionary.stringToId("\"Ali Haidar\"@en", TripleComponentRole.OBJECT); // by default of there is no string datatype in the rdf file, the dictionary will create a section for the // strings - assertEquals("",dictionary.dataTypeOfId(id)); - - - - } catch (IOException e) { - e.printStackTrace(); - } catch (ParserException e) { - e.printStackTrace(); + assertEquals("", dictionary.dataTypeOfId(id).toString()); } } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/RDFFluxStopTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/RDFFluxStopTest.java index 1af5212c..015323c0 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/RDFFluxStopTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/RDFFluxStopTest.java @@ -8,7 +8,13 @@ public class RDFFluxStopTest { private void assertExportSame(RDFFluxStop flux) { - assertEquals(flux, RDFFluxStop.readConfig(flux.asConfig())); + String cfg = flux.asConfig(); + RDFFluxStop readCfg = RDFFluxStop.readConfig(cfg); + assertEquals(flux, readCfg); + String cfg2 = readCfg.asConfig(); + assertEquals(cfg, cfg2); + RDFFluxStop readCfg2 = RDFFluxStop.readConfig(cfg); + assertEquals(flux, readCfg2); } @Test @@ -17,6 +23,8 @@ public void optionTest() { assertEquals(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":42", RDFFluxStop.countLimit(42).asConfig()); assertEquals(HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":34", RDFFluxStop.sizeLimit(34).asConfig()); + assertEquals("()!(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + 
":34)", RDFFluxStop.sizeLimit(34).not().asConfig()); + assertEquals( "(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_COUNT + ":42)&(" + HDTOptionsKeys.RDF_FLUX_STOP_VALUE_SIZE + ":34)", @@ -36,7 +44,8 @@ public void optionTest() { ); assertExportSame(RDFFluxStop.countLimit(42).or(RDFFluxStop.sizeLimit(34))); - assertExportSame((RDFFluxStop.countLimit(42).and(RDFFluxStop.countLimit(1))).or(RDFFluxStop.sizeLimit(34).and(RDFFluxStop.noLimit())).and(RDFFluxStop.countLimit(23))); + assertExportSame(RDFFluxStop.countLimit(42).or(RDFFluxStop.sizeLimit(34)).not()); + assertExportSame((RDFFluxStop.countLimit(42).and(RDFFluxStop.countLimit(1))).or(RDFFluxStop.sizeLimit(34).not().and(RDFFluxStop.noLimit())).and(RDFFluxStop.countLimit(23))); assertNull(RDFFluxStop.readConfig("")); assertNull(RDFFluxStop.readConfig(null)); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorDiffTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorDiffTest.java index fbe6f719..cf8b2b54 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorDiffTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorDiffTest.java @@ -8,12 +8,12 @@ import org.junit.runners.Parameterized; import org.rdfhdt.hdt.compact.bitmap.BitmapFactory; import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; -import org.rdfhdt.hdt.dictionary.DictionaryFactory; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.hdt.writer.TripleWriterHDT; import org.rdfhdt.hdt.hdtDiff.utils.TripleStringUtility; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.triples.IteratorTripleID; import org.rdfhdt.hdt.triples.TripleID; @@ -108,8 +108,8 @@ public static HDTDiffData createTestHDT(File fileName, File fileName2, int subje public static Collection genParam() { return Arrays.asList( HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION, - DictionaryFactory.DICTIONARY_TYPE_FOUR_SECTION_BIG, - DictionaryFactory.DICTIONARY_TYPE_MULTI_OBJECTS, + HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG, + HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION ); } @@ -120,7 +120,7 @@ public static Collection genParam() { public BitmapTriplesIteratorDiffTest(String dictionaryType) { spec = new HDTSpecification(); - spec.set("dictionary.type", dictionaryType); + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, dictionaryType); } @Test @@ -185,4 +185,4 @@ public void diffTest() throws IOException { } } } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorPositionTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorPositionTest.java index 811b9c96..de3227f0 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorPositionTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorPositionTest.java @@ -6,19 +6,16 @@ import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import org.rdfhdt.hdt.dictionary.DictionaryFactory; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; -import 
org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.iterator.DictionaryTranslateIterator; import org.rdfhdt.hdt.iterator.DictionaryTranslateIteratorBuffer; import org.rdfhdt.hdt.iterator.SequentialSearchIteratorTripleID; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; -import org.rdfhdt.hdt.rdf.RDFParserCallback; -import org.rdfhdt.hdt.rdf.RDFParserFactory; import org.rdfhdt.hdt.triples.IteratorTripleString; import org.rdfhdt.hdt.triples.TripleID; import org.rdfhdt.hdt.triples.TripleString; @@ -34,11 +31,11 @@ @RunWith(Parameterized.class) public class BitmapTriplesIteratorPositionTest { - public static final List DICTONARIES = Arrays.asList( - HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION, - DictionaryFactory.DICTIONARY_TYPE_FOUR_SECTION_BIG, - DictionaryFactory.DICTIONARY_TYPE_MULTI_OBJECTS, - HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION + public static final List DICTIONARIES = List.of( + HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION, + HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG, + HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, + HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_PSFC_SECTION ); private static final Field ITERATOR_SUB; @@ -62,7 +59,7 @@ public class BitmapTriplesIteratorPositionTest { @Parameterized.Parameters(name = "{0}") public static Collection genParam() { List lst = new ArrayList<>(); - for (String dict :DICTONARIES){ + for (String dict : DICTIONARIES){ lst.add(new Object[] { dict, 25, 50, 37, 12 }); lst.add(new Object[] { dict, 25, 50, 37, 0 }); lst.add(new Object[] { dict, 25, 50, 0, 12 }); @@ -93,7 +90,7 @@ public static Collection genParam() { public BitmapTriplesIteratorPositionTest(String dictionaryType, int subjects, int predicates, int objects, int shared) { spec = new HDTSpecification(); - spec.set("dictionary.type", dictionaryType); + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, dictionaryType); this.subjects = subjects; this.predicates = predicates; this.objects = objects; @@ -129,7 +126,7 @@ public void searchAllTest() throws IOException, NotFoundException { IteratorTripleString it = data.searchForSPO(0, 0, 0); - printIterator(it); + // printIterator(it); long index = 0L; while (it.hasNext()) { @@ -156,7 +153,7 @@ public void searchAllTestBuffer() throws IOException, NotFoundException { IteratorTripleString it = data.searchForSPO(0, 0, 0); - printIterator(it); + // printIterator(it); long index = 0L; while (it.hasNext()) { @@ -192,7 +189,7 @@ private void searchTest(int s, int p, int o) throws IOException, NotFoundExcepti IteratorTripleString it = data.searchForSPO(s, p, o); - printIterator(it); + // printIterator(it); while (it.hasNext()) { TripleString triple = it.next(); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/utils/HDTTestUtils.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/utils/HDTTestUtils.java index 8378d42f..a812f65c 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/utils/HDTTestUtils.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/utils/HDTTestUtils.java @@ -1,20 +1,32 @@ package org.rdfhdt.hdt.triples.impl.utils; import org.junit.Assert; +import org.rdfhdt.hdt.dictionary.Dictionary; +import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdt.hdt.writer.TripleWriterHDT; import org.rdfhdt.hdt.options.HDTOptions; +import 
org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.triples.IteratorTripleString; import org.rdfhdt.hdt.triples.TripleID; import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; import java.io.Closeable; import java.io.File; import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; /** * class to generate a synthetic hdt for test purposes. @@ -27,6 +39,79 @@ public class HDTTestUtils implements Closeable { */ public static final String BASE_URI = "http://ex.org/"; + public static class Tuple { + public final T1 t1; + public final T2 t2; + + public Tuple(T1 t1, T2 t2) { + this.t1 = t1; + this.t2 = t2; + } + + public T1 getT1() { + return t1; + } + + public T2 getT2() { + return t2; + } + + @Override + public String toString() { + return getT1() + ", " + getT2(); + } + } + + public static class CoIterator implements Iterator> { + private final Iterator it1; + private final Iterator it2; + + private Tuple next; + + public CoIterator(Iterator it1, Iterator it2) { + this.it1 = it1; + this.it2 = it2; + } + + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + T1 t1; + T2 t2; + + if (it1.hasNext()) { + t1 = it1.next(); + } else { + t1 = null; + } + + if (it2.hasNext()) { + t2 = it2.next(); + } else if (t1 == null) { + return false; + } else { + t2 = null; + } + + next = new Tuple<>(t1, t2); + return true; + } + + @Override + public Tuple next() { + if (!hasNext()) { + return null; + } + try { + return next; + } finally { + next = null; + } + } + } + public class SpoId { public final int s, p, o; @@ -48,6 +133,65 @@ public long getIndex() { } } + public static void printDictionary(HDT hdt) { + System.out.println("Dictionary"); + Dictionary dict = hdt.getDictionary(); + + Map sect; + + if (HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS.equals(dict.getType())) { + sect = dict.getAllObjects(); + } else { + Map sect2 = new TreeMap<>(CharSequenceComparator.getInstance()); + sect2.put("subjects", dict.getSubjects()); + sect2.put("predicates", dict.getPredicates()); + sect2.put("objects", dict.getObjects()); + sect2.put("shareds", dict.getShared()); + sect = sect2; + } + + sect.forEach((key, sec) -> { + System.out.println("--- " + key); + sec.getSortedEntries().forEachRemaining(System.out::println); + }); + } + + public static void printCoDictionary(HDT hdt, HDT hdt2) { + System.out.println("Dictionary"); + Dictionary dict = hdt.getDictionary(); + Dictionary dict2 = hdt2.getDictionary(); + + Map sect1; + Map sect2; + + if (HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS.equals(dict.getType())) { + sect1 = dict.getAllObjects(); + assertEquals(HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, dict2.getType()); + sect2 = dict2.getAllObjects(); + } else { + assertNotEquals(HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, dict2.getType()); + Map sect11 = new TreeMap<>(CharSequenceComparator.getInstance()); + sect11.put("subjects", dict.getSubjects()); + sect11.put("predicates", dict.getPredicates()); + sect11.put("objects", dict.getObjects()); + sect11.put("shareds", dict.getShared()); + sect1 = sect11; + + Map sect21 = new TreeMap<>(CharSequenceComparator.getInstance()); + sect21.put("subjects", dict2.getSubjects()); + sect21.put("predicates", dict2.getPredicates()); + 
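CoIterator zips two iterators of possibly different lengths, padding the shorter side with null instead of stopping early. A small sketch of its contract (java.util imports assumed):

    Iterator<String> a = List.of("x", "y").iterator();
    Iterator<Integer> b = List.of(1, 2, 3).iterator();
    // prints "x, 1", "y, 2" and finally "null, 3"
    new HDTTestUtils.CoIterator<>(a, b).forEachRemaining(System.out::println);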
sect21.put("objects", dict2.getObjects()); + sect21.put("shareds", dict2.getShared()); + sect2 = sect21; + } + + Set keys = sect1.keySet(); + assertEquals(keys, sect2.keySet()); + + keys.forEach(key -> new CoIterator<>(sect1.get(key).getSortedEntries(), sect2.get(key).getSortedEntries()) + .forEachRemaining(System.out::println)); + } + /** * memory hdt */ @@ -106,7 +250,7 @@ public HDTTestUtils(File f, int subjects, int predicates, int objects, int share } if (buffer) this.hdt = HDTManager.mapHDT(hdtFile.getAbsolutePath(), null, spec); else this.hdt = HDTManager.loadHDT(hdtFile.getAbsolutePath(), null, spec); - Assert.assertEquals("HDT count", triples, hdt.getTriples().getNumberOfElements()); + assertEquals("HDT count", triples, hdt.getTriples().getNumberOfElements()); this.triples = triples; } @@ -202,7 +346,7 @@ public SpoId tripleToSpo(TripleString triple) { public IteratorTripleString searchForSPO(int s, int p, int o) throws NotFoundException { TripleString tr = spoToTriple(s, p, o); - System.out.println("Search with pattern:" + (s == 0 ? "?" : "S") + (p == 0 ? "?" : "P") + (o == 0 ? "?" : "O")); + // System.out.println("Search with pattern:" + (s == 0 ? "?" : "S") + (p == 0 ? "?" : "P") + (o == 0 ? "?" : "O")); return hdt.search(tr.getSubject(), tr.getPredicate(), tr.getObject()); } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java index 4ec5523b..f4d981db 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java @@ -9,6 +9,7 @@ import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.triples.TripleString; import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.string.ByteStringUtil; @@ -27,6 +28,11 @@ import java.util.Random; import java.util.zip.GZIPOutputStream; +/** + * Utility class to create fake large dataset + * + * @author Antoine Willerval + */ public class LargeFakeDataSetStreamSupplier { private static final Charset DEFAULT_CHARSET = ByteStringUtil.STRING_ENCODING; @@ -37,17 +43,31 @@ public class LargeFakeDataSetStreamSupplier { * @param i id * @return string */ - public static String stringNameOfInt(int i) { - String table = "abcdefghijklmnopqrstuvwxyz"; + public static String stringNameOfInt(int i, boolean unicode) { StringBuilder out = new StringBuilder(); - int c = i; - do { - out.append(table.charAt(c % table.length())); - c /= table.length(); - } while (c != 0); + if (unicode) { + return "" + (char) (30 + Math.min(i, Character.MAX_VALUE - 30)); + } else { + String table = "abcdefghijklmnopqrstuvwxyz"; + int c = i; + do { + out.append(table.charAt(c % table.length())); + c /= table.length(); + } while (c != 0); + } return out.toString(); } + /** + * create a lowercase name from a number, to create string without any number in it + * + * @param i id + * @return string + */ + public static String stringNameOfInt(int i) { + return stringNameOfInt(i, false); + } + /** * estimate the size of a triple * @@ -62,10 +82,24 @@ public static long estimateTripleSize(TripleString triple) { } } + /** + * create a supplier with a max size + * + * @param maxSize the max size + * @param seed the seed of the supplier, the same seed will create the same supplier + * @return supplier 
+ */ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxSize(long maxSize, long seed) { return new LargeFakeDataSetStreamSupplier(maxSize, Long.MAX_VALUE, seed); } + /** + * create a supplier with a max count + * + * @param maxTriples the max number of triples + * @param seed the seed of the supplier, the same seed will create the same supplier + * @return supplier + */ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxTriples(long maxTriples, long seed) { return new LargeFakeDataSetStreamSupplier(Long.MAX_VALUE, maxTriples, seed); } @@ -77,6 +111,9 @@ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxTriples(long m public int maxFakeType = 10; public int maxLiteralSize = 2; public int maxElementSplit = Integer.MAX_VALUE; + private long slowStream; + private boolean sameTripleString; + private boolean unicode; private LargeFakeDataSetStreamSupplier(long maxSize, long maxTriples, long seed) { this.maxSize = maxSize; @@ -85,22 +122,53 @@ private LargeFakeDataSetStreamSupplier(long maxSize, long maxTriples, long seed) reset(); } + /** + * reset the supplier like it was just created + */ public void reset() { random = new Random(seed); } + /** + * @return iterator of triples + */ public Iterator createTripleStringStream() { return new FakeStatementIterator(); } + /** + * create a nt file from the stream + * + * @param file the file to write + * @throws IOException io exception + * @see #createNTFile(java.nio.file.Path) + */ public void createNTFile(String file) throws IOException { - try (FileWriter writer = new FileWriter(file)) { + createNTFile(Path.of(file)); + } + + /** + * create a nt file from the stream + * + * @param file the file to write + * @throws IOException io exception + * @see #createNTFile(java.lang.String) + */ + public void createNTFile(Path file) throws IOException { + try (FileWriter writer = new FileWriter(file.toFile())) { for (Iterator it = createTripleStringStream(); it.hasNext(); ) { it.next().dumpNtriple(writer); } } } + /** + * create a threaded stream (to close!) 
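Typical use of the supplier in a test looks as follows. A sketch; the output path is a placeholder, and the fixed seed makes the generated dataset reproducible:

    LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier
            .createSupplierWithMaxTriples(10_000, 42)
            .withMaxLiteralSize(50);
    supplier.createNTFile(Path.of("fake.nt"));
    // reset() rewinds the random generator, so the stream below replays
    // exactly the triples that were written to the file
    supplier.reset();
    Iterator<TripleString> it = supplier.createTripleStringStream();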
with a particular compression + * + * @param compressionType compression type + * @return threaded stream + * @throws IOException io exception + */ public ThreadedStream createNTInputStream(CompressionType compressionType) throws IOException { PipedOutputStream pout = new PipedOutputStream(); InputStream is = new PipedInputStream(pout); @@ -141,36 +209,73 @@ public ThreadedStream createNTInputStream(CompressionType compressionType) throw return new ThreadedStream(run, is); } + /** + * create an HDT from the stream using two-pass algorithm + * + * @param spec hdt options + * @return hdt + * @throws ParserException parsing exception + * @throws IOException io exception + */ public HDT createFakeHDTTwoPass(HDTOptions spec) throws ParserException, IOException { Path f = Path.of("tempNtFile.nt").toAbsolutePath(); try { - createNTFile(f.toString()); - spec.set("loader.type", "two-pass"); + createNTFile(f); + spec.set(HDTOptionsKeys.LOADER_TYPE_KEY, HDTOptionsKeys.LOADER_TYPE_VALUE_TWO_PASS); return HDTManager.generateHDT(f.toString(), "http://w", RDFNotation.NTRIPLES, spec, null); } finally { Files.deleteIfExists(f); } } + + /** + * create an HDT from the stream + * + * @param spec hdt options + * @return hdt + * @throws ParserException parsing exception + * @throws IOException io exception + */ public HDT createFakeHDT(HDTOptions spec) throws ParserException, IOException { return HDTManager.generateHDT(createTripleStringStream(), "http://w", spec, null); } + /** + * create an HDT from the stream and save it to a file + * + * @param spec hdt options + * @param location save location + * @throws ParserException parsing exception + * @throws IOException io exception + */ public void createAndSaveFakeHDT(HDTOptions spec, String location) throws ParserException, IOException { try (HDT hdt = createFakeHDT(spec)) { hdt.saveToHDT(location, null); } } + + /** + * create an HDT from the stream using 2pass algorithm and save it to a file + * + * @param spec hdt options + * @param location save location + * @throws ParserException parsing exception + * @throws IOException io exception + */ public void createAndSaveFakeHDTTwoPass(HDTOptions spec, String location) throws ParserException, IOException { try (HDT hdt = createFakeHDTTwoPass(spec)) { hdt.saveToHDT(location, null); } } - private CharSequence createSubject() { - return createPredicate(); + private CharSequence createResource() { + if (random.nextInt(10) == 0) { + return "_:bnode" + random.nextInt(maxElementSplit / 10); + } + return createIRI(); } - private CharSequence createPredicate() { + private CharSequence createIRI() { return "http://w" + random.nextInt(maxElementSplit) + "i.test.org/#Obj" + random.nextInt(maxElementSplit); } @@ -180,12 +285,12 @@ private CharSequence createType() { private CharSequence createValue() { if (random.nextBoolean()) { - return createPredicate(); + return createResource(); } int size = random.nextInt(maxLiteralSize); StringBuilder litText = new StringBuilder(); for (int i = 0; i < size; i++) { - litText.append(stringNameOfInt(random.nextInt(maxElementSplit))).append(" "); + litText.append(stringNameOfInt(unicode ? 
random.nextInt(Character.MAX_VALUE - 30) : random.nextInt(maxElementSplit), unicode)); } String text = "\"" + litText + "\""; int litType = random.nextInt(3); @@ -204,8 +309,15 @@ private CharSequence createValue() { private class FakeStatementIterator implements Iterator { private long size; private long count; + private TripleString buffer; private TripleString next; + FakeStatementIterator() { + if (sameTripleString) { + buffer = new TripleString(); + } + } + @Override public boolean hasNext() { if (size >= maxSize || count >= maxTriples) { @@ -215,19 +327,34 @@ public boolean hasNext() { return true; } - next = new TripleString( - createSubject(), - createPredicate(), - createValue() - ); - - long estimation = estimateTripleSize( - new TripleString( - next.getSubject().toString(), - next.getPredicate().toString(), - next.getObject().toString() - ) - ); + CharSequence resource = createResource(); + CharSequence iri = createIRI(); + CharSequence value = createValue(); + + if (buffer != null) { + buffer.setAll( + resource, + iri, + value + ); + next = buffer; + } else { + next = new TripleString( + resource, + iri, + value + ); + } + + if (slowStream > 0) { + try { + Thread.sleep(slowStream); + } catch (InterruptedException e) { + throw new AssertionError(e); + } + } + + long estimation = estimateTripleSize(next); size += estimation; count++; @@ -245,22 +372,76 @@ public TripleString next() { } } + /** + * set the maximum number of fake type + * + * @param maxFakeType maximum number + * @return this + */ public LargeFakeDataSetStreamSupplier withMaxFakeType(int maxFakeType) { this.maxFakeType = maxFakeType; return this; } + /** + * set the maximum element split number + * + * @param maxElementSplit maximum number + * @return this + */ public LargeFakeDataSetStreamSupplier withMaxElementSplit(int maxElementSplit) { this.maxElementSplit = maxElementSplit; return this; } + /** + * set the maximum literal size + * + * @param maxLiteralSize maximum number + * @return this + */ public LargeFakeDataSetStreamSupplier withMaxLiteralSize(int maxLiteralSize) { this.maxLiteralSize = maxLiteralSize; return this; } + /** + * allow using unicode or not in the literals + * + * @param unicode unicode + * @return this + */ + public LargeFakeDataSetStreamSupplier withUnicode(boolean unicode) { + this.unicode = unicode; + return this; + } + + /** + * add a latency to the stream generation + * + * @param slowStream latency (millis) + * @return this + */ + public LargeFakeDataSetStreamSupplier withSlowStream(long slowStream) { + this.slowStream = slowStream; + return this; + } + + /** + * use the same {@link org.rdfhdt.hdt.triples.TripleString} object, better to simulate the RDFParser outputs + * + * @param sameTripleString use same triple + * @return this + */ + public LargeFakeDataSetStreamSupplier withSameTripleString(boolean sameTripleString) { + this.sameTripleString = sameTripleString; + return this; + } + + /** + * Stream connected to a thread to interrupt in case of Exception + */ public static class ThreadedStream { private final ExceptionThread thread; private final InputStream stream; @@ -270,10 +451,16 @@ public ThreadedStream(ExceptionThread thread, InputStream stream) { this.stream = stream; } + /** + * @return the thread + */ public ExceptionThread getThread() { return thread; } + /** + * @return the stream + */ public InputStream getStream() { return stream; } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java 
b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java index e3cd825e..37fc5565 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java @@ -1,5 +1,6 @@ package org.rdfhdt.hdt.util; +import org.junit.Assert; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -16,6 +17,10 @@ import java.nio.file.Path; import java.util.Iterator; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + public class LargeFakeDataSetStreamSupplierTest { @Rule public TemporaryFolder tempDir = new TemporaryFolder(); @@ -25,7 +30,9 @@ public void streamTest() throws IOException { Path f = tempDir.newFolder().toPath(); Path testNt = f.resolve("test.nt"); triples.createNTFile(testNt.toAbsolutePath().toString()); + triples.reset(); + Iterator it2 = triples.createTripleStringStream(); try (InputStream is = Files.newInputStream(testNt)) { try (PipedCopyIterator it = RDFParserFactory.readAsIterator( RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES), @@ -35,14 +42,11 @@ public void streamTest() throws IOException { RDFNotation.NTRIPLES )) { it.forEachRemaining(s -> { - try { - Thread.sleep(50); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - System.out.println(s + " " + s.getSubject().getClass()); + assertTrue(it2.hasNext()); + assertEquals(it2.next(), s); }); + assertFalse(it.hasNext()); } } } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LiteralsUtilsTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LiteralsUtilsTest.java new file mode 100644 index 00000000..75c420a9 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LiteralsUtilsTest.java @@ -0,0 +1,105 @@ +package org.rdfhdt.hdt.util; + +import org.junit.Test; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.CompactString; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class LiteralsUtilsTest { + /** + * convert to compact string if required + * + * @param excepted excepted + * @param actual actual + */ + public static void assertEqualsCompact(CharSequence excepted, CharSequence actual) { + if (excepted instanceof String && actual instanceof String) { + assertEquals(excepted, actual); + return; + } + + if (excepted instanceof String) { + assertEquals(new CompactString(excepted), actual); + return; + } + if (actual instanceof String) { + assertEquals(excepted, new CompactString(actual)); + return; + } + + + assertEquals(0, CharSequenceComparator.getInstance().compare(excepted, actual)); + } + + @Test + public void containsLanguageTest() { + assertTrue(LiteralsUtils.containsLanguage("\"hello\"@fr")); + assertTrue(LiteralsUtils.containsLanguage("\"hello\"@fr-ca")); + assertFalse(LiteralsUtils.containsLanguage("\"hello\"^^")); + assertFalse(LiteralsUtils.containsLanguage("\"hello\"")); + assertFalse(LiteralsUtils.containsLanguage("")); + } + + @Test + public void removeTypeTest() { + assertEqualsCompact("\"hello\"@fr", LiteralsUtils.removeType("\"hello\"@fr")); + assertEqualsCompact("\"hello\"@fr-ca", LiteralsUtils.removeType("\"hello\"@fr-ca")); + assertEqualsCompact("\"hello\"", LiteralsUtils.removeType("\"hello\"^^")); + 
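The "pref" form exercised below moves a literal's datatype in front of the lexical form, which lets a plain sort group literals of the same datatype together. A sketch with a made-up datatype IRI:

    // litToPref:  "aaa"^^<http://t>  ->  <http://t>"aaa"
    // prefToLit is the inverse transformation
    CharSequence pref = LiteralsUtils.litToPref("\"aaa\"^^<http://t>");
    CharSequence lit  = LiteralsUtils.prefToLit(pref);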
assertEqualsCompact("\"hello\"", LiteralsUtils.removeType("\"hello\"")); + assertEqualsCompact("", LiteralsUtils.removeType("")); + } + + @Test + public void getTypeTest() { + assertEqualsCompact(LiteralsUtils.LITERAL_LANG_TYPE, LiteralsUtils.getType("\"hello\"@fr")); + assertEqualsCompact(LiteralsUtils.LITERAL_LANG_TYPE, LiteralsUtils.getType("\"hello\"@fr-ca")); + assertEqualsCompact("", LiteralsUtils.getType("\"hello\"^^")); + assertEqualsCompact("", LiteralsUtils.getType("\"hello\"^^")); + assertEqualsCompact(LiteralsUtils.NO_DATATYPE, LiteralsUtils.getType("\"hello\"")); + assertEqualsCompact(LiteralsUtils.NO_DATATYPE, LiteralsUtils.getType("")); + } + + @Test + public void litStrTest() { + assertTrue(LiteralsUtils.isLangType(LiteralsUtils.LITERAL_LANG_TYPE, 0)); + assertTrue(LiteralsUtils.isLangType(LiteralsUtils.LITERAL_LANG_TYPE_STR, 0)); + } + + @Test + public void litToPrefTest() { + assertEqualsCompact("\"aaa\"", LiteralsUtils.litToPref("\"aaa\"")); + assertEqualsCompact("^^\"aaa\"", LiteralsUtils.litToPref("\"aaa\"^^")); + assertEqualsCompact("^^" + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr", LiteralsUtils.litToPref("\"aaa\"@fr-fr")); + + assertEqualsCompact("\"aaa\"", LiteralsUtils.litToPref(LiteralsUtils.prefToLit("\"aaa\""))); + assertEqualsCompact("^^\"aaa\"", LiteralsUtils.litToPref(LiteralsUtils.prefToLit("^^\"aaa\""))); + assertEqualsCompact("^^" + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr", LiteralsUtils.litToPref(LiteralsUtils.prefToLit("^^" + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr"))); + + assertEqualsCompact("", LiteralsUtils.litToPref("")); + } + + @Test + public void prefToLitTest() { + assertEqualsCompact("\"aaa\"", LiteralsUtils.litToPref("\"aaa\"")); + assertEqualsCompact("\"aaa\"^^", LiteralsUtils.prefToLit("^^\"aaa\"")); + assertEqualsCompact("\"aaa\"@fr-fr", LiteralsUtils.prefToLit("^^" + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr")); + assertEqualsCompact("", LiteralsUtils.prefToLit("")); + + assertEqualsCompact("\"aaa\"", LiteralsUtils.litToPref(LiteralsUtils.litToPref("\"aaa\""))); + assertEqualsCompact("\"aaa\"^^", LiteralsUtils.prefToLit(LiteralsUtils.litToPref("\"aaa\"^^"))); + assertEqualsCompact("\"aaa\"@fr-fr", LiteralsUtils.prefToLit(LiteralsUtils.litToPref("\"aaa\"@fr-fr"))); + assertEqualsCompact("", LiteralsUtils.prefToLit(LiteralsUtils.litToPref(""))); + } + + @Test + public void removePrefTypeTest() { + assertEqualsCompact("\"hello\"@fr", LiteralsUtils.removePrefType("\"hello\"@fr")); + assertEqualsCompact("\"hello\"@fr-ca", LiteralsUtils.removePrefType("\"hello\"@fr-ca")); + assertEqualsCompact("\"hello\"", LiteralsUtils.removePrefType("^^\"hello\"")); + assertEqualsCompact("\"hello\"", LiteralsUtils.removePrefType("\"hello\"")); + assertEqualsCompact("", LiteralsUtils.removePrefType("")); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java index b6a85e98..b2d3edee 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java @@ -20,10 +20,10 @@ public void writeReadTest() throws InterruptedException, IOException { out.connect(in); List nodes = Arrays.asList( new IndexedNode("bob", 1), - new IndexedNode("michel", 3), + new IndexedNode("charles", 6), new IndexedNode("jack", 2), - new IndexedNode("charles", 6) - ); + new IndexedNode("michel", 3) + ); 
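These tests follow the ExceptionThread pattern introduced by this patch series: run producer and consumer concurrently and rethrow on join whatever either side threw. A schematic sketch, assuming the attach/startAll/joinAndCrashIfRequired helpers of org.rdfhdt.hdt.util.concurrent.ExceptionThread:

    new ExceptionThread(() -> {
        // read side: consume from the piped input, run assertions
    }, "ReaderThread").attach(new ExceptionThread(() -> {
        // write side: produce into the piped output
    }, "WriterThread")).startAll().joinAndCrashIfRequired();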
new ExceptionThread(() -> { CompressNodeReader reader = new CompressNodeReader(in); Assert.assertEquals(nodes.size(), reader.getSize()); @@ -67,10 +67,10 @@ public void writeReadUtilTest() throws InterruptedException, IOException { out.connect(in); List nodes = Arrays.asList( new IndexedNode("bob", 1), - new IndexedNode("michel", 3), + new IndexedNode("charles", 6), new IndexedNode("jack", 2), - new IndexedNode("charles", 6) - ); + new IndexedNode("michel", 3) + ); new ExceptionThread(() -> { CompressNodeReader reader = new CompressNodeReader(in); Assert.assertEquals(nodes.size(), reader.getSize()); @@ -109,10 +109,10 @@ public void writeReadPassTest() throws InterruptedException, IOException { out.connect(in); List nodes = Arrays.asList( new IndexedNode("bob", 1), - new IndexedNode("michel", 3), + new IndexedNode("charles", 6), new IndexedNode("jack", 2), - new IndexedNode("charles", 6) - ); + new IndexedNode("michel", 3) + ); new ExceptionThread(() -> { CompressNodeReader reader = new CompressNodeReader(in); Assert.assertEquals(nodes.size(), reader.getSize()); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java index 1ba68115..6354f4dd 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java @@ -4,6 +4,7 @@ import org.junit.Test; import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import java.util.Arrays; @@ -50,7 +51,11 @@ public void noDupeTest() { duplicates.add(5L); Iterator actual = CompressUtil.asNoDupeCharSequenceIterator( - ExceptionIterator.of(duplicatedList.iterator()), + ExceptionIterator.of(duplicatedList.iterator()) + .map(in -> { + in.setNode(ByteString.of(in.getNode())); + return in; + }), (originalIndex, duplicatedIndex, oldIndex) -> Assert.assertTrue(duplicates.remove(duplicatedIndex)) ); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequenceTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequenceTest.java index 7ef6efd8..250e2f7e 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequenceTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequenceTest.java @@ -2,6 +2,8 @@ import org.junit.Test; +import java.util.Comparator; + import static org.junit.Assert.*; public class AssertionCharSequenceTest { diff --git a/hdt-java-core/src/test/resources/HdtCatLiteralsTest.java b/hdt-java-core/src/test/resources/HdtCatLiteralsTest.java index 08cc1f62..eb5a19b1 100755 --- a/hdt-java-core/src/test/resources/HdtCatLiteralsTest.java +++ b/hdt-java-core/src/test/resources/HdtCatLiteralsTest.java @@ -102,8 +102,8 @@ public void help(String file1, String file2, String concat){ // System.out.println(search.next()); // } // -// Iterator no_datatype1 = hdt1.getDictionary().getAllObjects().get("NO_DATATYPE").getSortedEntries(); -// Iterator no_datatype2 = hdt2.getDictionary().getAllObjects().get("NO_DATATYPE").getSortedEntries(); +// Iterator no_datatype1 = hdt1.getDictionary().getAllObjects().get(LiteralsUtils.NO_DATATYPE).getSortedEntries(); +// Iterator no_datatype2 = hdt2.getDictionary().getAllObjects().get(LiteralsUtils.NO_DATATYPE).getSortedEntries(); // while 
(no_datatype1.hasNext()){ // CharSequence next1 = no_datatype1.next(); // CharSequence next2 = no_datatype2.next(); diff --git a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/DummyMap.java b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/DummyMap.java index 5bfc6891..858386fb 100644 --- a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/DummyMap.java +++ b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/DummyMap.java @@ -8,7 +8,7 @@ public class DummyMap implements Map { @SuppressWarnings("rawtypes") - private static DummyMap instance= new DummyMap(); + private static final DummyMap instance= new DummyMap(); @SuppressWarnings("unchecked") public static Map getInstance() { diff --git a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/HDTGraph.java b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/HDTGraph.java index 09ec868e..1e3c5c9a 100644 --- a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/HDTGraph.java +++ b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/HDTGraph.java @@ -55,11 +55,11 @@ public class HDTGraph extends GraphBase { private static final HDTCapabilities capabilities= new HDTCapabilities(); - private HDT hdt; - private NodeDictionary nodeDictionary; - private ReorderTransformation reorderTransform; + private final HDT hdt; + private final NodeDictionary nodeDictionary; + private final ReorderTransformation reorderTransform; private long numSearches; - private boolean closeAfter; + private final boolean closeAfter; static { // Register OpExecutor diff --git a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/HDTGraphAssembler.java b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/HDTGraphAssembler.java index f654f237..e8e19f37 100644 --- a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/HDTGraphAssembler.java +++ b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/HDTGraphAssembler.java @@ -74,7 +74,7 @@ public Model open(Assembler a, Resource root, Mode mode) return ModelFactory.createModelForGraph(graph); } catch (IOException e) { log.error("Error reading HDT file: {}", file, e); - throw new AssemblerException(root, "Error reading HDT file: "+file+" / "+e.toString()); + throw new AssemblerException(root, "Error reading HDT file: "+file+" / "+ e); } } diff --git a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/NodeDictionary.java b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/NodeDictionary.java index afb265b4..f11b868a 100644 --- a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/NodeDictionary.java +++ b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/NodeDictionary.java @@ -56,10 +56,10 @@ public class NodeDictionary { private final Dictionary dictionary; @SuppressWarnings("unchecked") - private final DictionaryCache cacheIDtoNode [] = new DictionaryCache[TripleComponentRole.values().length]; + private final DictionaryCache[] cacheIDtoNode = new DictionaryCache[TripleComponentRole.values().length]; @SuppressWarnings("unchecked") - Map cacheNodeToId [] = new Map[TripleComponentRole.values().length]; + Map[] cacheNodeToId = new Map[TripleComponentRole.values().length]; public NodeDictionary(Dictionary dictionary) { this.dictionary = dictionary; @@ -67,21 +67,21 @@ public NodeDictionary(Dictionary dictionary) { // ID TO NODE final int idToNodeSize = 20000; if(dictionary.getNsubjects()>idToNodeSize) { - cacheIDtoNode[0] = new DictionaryCacheLRI(idToNodeSize); + cacheIDtoNode[0] = new DictionaryCacheLRI<>(idToNodeSize); } else { - cacheIDtoNode[0] = new DictionaryCacheArray((int) dictionary.getNsubjects()); + cacheIDtoNode[0] = new DictionaryCacheArray<>((int) dictionary.getNsubjects()); } if(dictionary.getNpredicates()>idToNodeSize) { - cacheIDtoNode[1] = new 
DictionaryCacheLRI(idToNodeSize); + cacheIDtoNode[1] = new DictionaryCacheLRI<>(idToNodeSize); } else { - cacheIDtoNode[1] = new DictionaryCacheArray((int) dictionary.getNpredicates()); + cacheIDtoNode[1] = new DictionaryCacheArray<>((int) dictionary.getNpredicates()); } if(dictionary.getNobjects()>idToNodeSize) { - cacheIDtoNode[2] = new DictionaryCacheLRI(idToNodeSize); + cacheIDtoNode[2] = new DictionaryCacheLRI<>(idToNodeSize); } else { - cacheIDtoNode[2] = new DictionaryCacheArray((int) dictionary.getNobjects()); + cacheIDtoNode[2] = new DictionaryCacheArray<>((int) dictionary.getNobjects()); } // NODE TO ID @@ -192,11 +192,11 @@ public TripleID getTriplePatID(Triple jenaTriple) { } public static PrefixMapping getMapping(ExecutionContext ctx) { - Query query = (Query) ctx.getContext().get(ARQConstants.sysCurrentQuery); + Query query = ctx.getContext().get(ARQConstants.sysCurrentQuery); return query.getPrefixMapping(); } - public static final Var asVar(Node node) + public static Var asVar(Node node) { if ( Var.isVar(node) ) return Var.alloc(node) ; diff --git a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/HDTOptimizedOp.java b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/HDTOptimizedOp.java index d396e3d4..e44f34f1 100644 --- a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/HDTOptimizedOp.java +++ b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/HDTOptimizedOp.java @@ -36,9 +36,7 @@ public int hashCode() { @Override public boolean equalTo(Op other, NodeIsomorphismMap labelMap) { - if(other instanceof HDTOptimizedOp) - return true; - return false; + return other instanceof HDTOptimizedOp; } } diff --git a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/OpExecutorHDT.java b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/OpExecutorHDT.java index d7d45612..6e5ff01e 100644 --- a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/OpExecutorHDT.java +++ b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/OpExecutorHDT.java @@ -57,14 +57,7 @@ public class OpExecutorHDT extends OpExecutor { - public final static OpExecutorFactory opExecFactoryHDT = new OpExecutorFactory() - { - @Override - public OpExecutor create(ExecutionContext execCxt) - { - return new OpExecutorHDT(execCxt) ; - } - }; + public final static OpExecutorFactory opExecFactoryHDT = OpExecutorHDT::new; private final boolean isForHDT; @@ -152,7 +145,7 @@ private static QueryIterator optimizeExecuteTriples(HDTGraph graph, } // -- Filter placement - Op op = null ; + Op op ; if ( exprs != null ) op = TransformFilterPlacement.transform(exprs, pattern) ; else @@ -216,11 +209,10 @@ private static class OpExecutorPlainHDT extends OpExecutor { final Predicate> filter; - @SuppressWarnings("unchecked") public OpExecutorPlainHDT(ExecutionContext execCxt) { super(execCxt) ; - filter = (Predicate>)execCxt.getContext().get(HDTJenaConstants.FILTER_SYMBOL); + filter = execCxt.getContext().get(HDTJenaConstants.FILTER_SYMBOL); } @Override diff --git a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/StageMatchTripleID.java b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/StageMatchTripleID.java index ef42160d..dcf25423 100644 --- a/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/StageMatchTripleID.java +++ b/hdt-jena/src/main/java/org/rdfhdt/hdtjena/solver/StageMatchTripleID.java @@ -26,13 +26,7 @@ package org.rdfhdt.hdtjena.solver; -import java.util.Iterator; -import java.util.Map; -import java.util.function.Function; -import java.util.function.Predicate; - import org.apache.jena.atlas.iterator.Iter; - import 
org.apache.jena.graph.Node; import org.apache.jena.graph.Triple; import org.apache.jena.shared.PrefixMapping; @@ -50,9 +44,14 @@ import org.rdfhdt.hdtjena.bindings.HDTId; import org.rdfhdt.hdtjena.util.VarAppearance; +import java.util.Iterator; +import java.util.Map; +import java.util.function.Function; + /** * For each input binding, emits all tuples matching a triple pattern. See {@link QueryIterTriplePattern}. */ +@SuppressWarnings("deprecation") public class StageMatchTripleID extends RepeatApplyIterator { @@ -61,10 +60,8 @@ public class StageMatchTripleID extends RepeatApplyIterator private final NodeDictionary dictionary ; private final Triples triples; private final TripleID patternID; - - private final PrefixMapping prefixMap; - - // Variables for this tuple after substitution + + // Variables for this tuple after substitution private final Var[] var = new Var[3]; private final boolean[] varIsSO = new boolean[3]; private final long numSharedSO; @@ -74,7 +71,7 @@ public StageMatchTripleID(HDTGraph graph, Iterator input, Triple p super(input); this.dictionary = graph.getNodeDictionary(); this.triples = graph.getHDT().getTriples(); - this.prefixMap = NodeDictionary.getMapping(execCxt); + PrefixMapping prefixMap = NodeDictionary.getMapping(execCxt); this.numSharedSO = graph.getHDT().getDictionary().getNshared(); // Convert Nodes to a TripleID @@ -138,42 +135,31 @@ protected Iterator makeNextStage(final BindingHDTId input) // Filter triples where S or O need to be shared. if(varIsSO[0] || varIsSO[2]) { - it = it.filter(new Predicate() { - @Override - public boolean test(TripleID t) { - if(varIsSO[0] && t.getSubject()>numSharedSO) { - return false; - } - if(varIsSO[2] && t.getObject()>numSharedSO) { - return false; - } - return true; - } - }); + it = it.filter(t -> { + if(varIsSO[0] && t.getSubject()>numSharedSO) { + return false; + } + return !varIsSO[2] || t.getObject() <= numSharedSO; + }); } // Map TripleID to BindingHDTId - Function binder = new Function() - { - @Override - public BindingHDTId apply(TripleID triple) - { - BindingHDTId output = new BindingHDTId(input) ; + Function binder = triple -> { + BindingHDTId output = new BindingHDTId(input) ; - if (var[0] != null && !insert(var[0], new HDTId(triple.getSubject(), TripleComponentRole.SUBJECT, dictionary), output)) { - return null; - } - if (var[1] != null && !insert(var[1], new HDTId(triple.getPredicate(), TripleComponentRole.PREDICATE, dictionary), output)) { - return null; - } - if (var[2] != null && !insert(var[2], new HDTId(triple.getObject(), TripleComponentRole.OBJECT, dictionary), output)) { - return null; - } + if (!(var[0] == null || insert(var[0], new HDTId(triple.getSubject(), TripleComponentRole.SUBJECT, dictionary), output))) { + return null; + } + if (!(var[1] == null || insert(var[1], new HDTId(triple.getPredicate(), TripleComponentRole.PREDICATE, dictionary), output))) { + return null; + } + if (!(var[2] == null || insert(var[2], new HDTId(triple.getObject(), TripleComponentRole.OBJECT, dictionary), output))) { + return null; + } - return output; - } - }; + return output; + }; return it.map(binder).removeNulls(); } From 549865ffca84363088a817684e3c4723ffa45618 Mon Sep 17 00:00:00 2001 From: qaate47 Date: Fri, 4 Nov 2022 17:58:10 +0100 Subject: [PATCH 5/9] add multithread logs --- .../src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java 
b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java index a527dc52..59395405 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java @@ -106,6 +106,9 @@ private static long getMaxTreeCatChunkSize() { @Parameter(names = "-cattreelocation", description = "Only with -cattree, set the tree building location") public String catTreeLocation; + @Parameter(names = "-multithread", description = "Use multithread logger") + public boolean multiThreadLog; + private static long findBestMemoryChunkDiskMapTreeCat() { Runtime runtime = Runtime.getRuntime(); long maxRam = (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * 0.85) / 3; @@ -228,7 +231,10 @@ public void execute() throws ParserException, IOException { listenerConsole.notifyProgress(100, "done"); } } else { - hdt = HDTManager.generateHDT(rdfInput, baseURI, notation, spec, this); + ProgressListener listenerConsole = + !quiet ? (multiThreadLog ? new MultiThreadListenerConsole() : this) + : null; + hdt = HDTManager.generateHDT(rdfInput, baseURI, notation, spec, listenerConsole); } System.out.println("File converted in: "+sw.stopAndShow()); From 73f7d2b23d71774f6d0e381d81b41cbef84f08de Mon Sep 17 00:00:00 2001 From: qaate47 Date: Tue, 8 Nov 2022 10:24:53 +0100 Subject: [PATCH 6/9] add unicode test and key print --- .../rdfhdt/hdt/options/HDTOptionsKeys.java | 126 ++++- .../main/java/org/rdfhdt/hdt/options/Key.java | 25 + .../java/org/rdfhdt/hdt/options/Value.java | 10 + .../org/rdfhdt/hdt/util/UnicodeEscape.java | 8 +- .../java/org/rdfhdt/hdt/tools/RDF2HDT.java | 60 ++- .../listener/MultiThreadListenerConsole.java | 41 +- .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java | 2 +- .../rdfhdt/hdt/util/string/ByteString.java | 1 + .../hdt/utils/DebugOrderNodeIterator.java | 10 +- .../org/rdfhdt/hdt/hdt/HDTManagerTest.java | 15 + .../rdfhdt/hdt/util/UnicodeEscapeTest.java | 65 +++ .../hdt/util/string/ByteStringTest.java | 48 ++ .../src/test/resources/unicodeTest.nt | 506 ++++++++++++++++++ 13 files changed, 896 insertions(+), 21 deletions(-) create mode 100644 hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java create mode 100644 hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/UnicodeEscapeTest.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/ByteStringTest.java create mode 100644 hdt-java-core/src/test/resources/unicodeTest.nt diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java index 8ae50d20..3a41dcc3 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java @@ -3,8 +3,17 @@ import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.rdf.RDFFluxStop; +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + /** * keys usable with {@link org.rdfhdt.hdt.options.HDTOptions#set(String, String)} + * * @author Antoine Willerval */ public class HDTOptionsKeys { @@ -13,58 +22,69 @@ public class HDTOptionsKeys { * Value can be {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE} or * {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE} */ + @Key(type = Key.Type.ENUM, desc = "Compression mode") public static final String 
LOADER_DISK_COMPRESSION_MODE_KEY = "loader.disk.compressMode"; /** * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort all the file before going to the next step, slower * but decrease the RAM usage. default config. */ + @Value(value = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort all the file before going to the next step, slower but decrease the RAM usage. default config") public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE = "compressionComplete"; /** * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort while reading all the file before going to the next * step, faster but increase the RAM usage. */ + @Value(value = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort while reading all the file before going to the next step, faster but increase the RAM usage.") public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial"; /** * Key for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods, * say the number of workers to merge the data. default to the number of processor. long value. */ + @Key(type = Key.Type.NUMBER, desc = "Number of core used to compress the HDT") public static final String LOADER_DISK_COMPRESSION_WORKER_KEY = "loader.disk.compressWorker"; /** * Key for the maximum size of a chunk on disk for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk * methods, the chunk should be in RAM before writing it on disk and should be sorted. long value. */ + @Key(type = Key.Type.NUMBER, desc = "Maximum size of a chunk") public static final String LOADER_DISK_CHUNK_SIZE_KEY = "loader.disk.chunkSize"; /** * Key for the location of the working directory {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods, * this directory will be deleted after the HDT generation. by default, the value is random, it is recommended to * set this option to delete the directory in case of an interruption of the process. file value. */ + @Key(type = Key.Type.PATH, desc = "Location of the disk generation directory") public static final String LOADER_DISK_LOCATION_KEY = "loader.disk.location"; /** * Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods, * this option will create a hdt file after the HDT generation, the returned HDT will be a mapped HDT of the HDT * file. slower, increase the disk usage, but drastically reduce the RAM usage. file value. */ + @Key(type = Key.Type.PATH, desc = "Location of the future HDT") public static final String LOADER_DISK_FUTURE_HDT_LOCATION_KEY = "loader.disk.futureHDTLocation"; /** * Key for the maximum number of file opened at the same time, should be greater than {@link #LOADER_DISK_KWAY_KEY}, * 1024 by default */ + @Key(type = Key.Type.NUMBER, desc = "Maximum number of file HDTDisk can open at the same time") public static final String LOADER_DISK_MAX_FILE_OPEN_KEY = "loader.disk.maxFileOpen"; /** * Key for the number of chunk layers opened at the same time, by default *

min(log2(maxFileOpen), chunkSize / (fileBufferSize * compressWorker))

*/ + @Key(type = Key.Type.NUMBER, desc = "log of the number of way the system can merge in genDisk") public static final String LOADER_DISK_KWAY_KEY = "loader.disk.kway"; /** * Key for the size of the buffers when opening a file */ + @Key(type = Key.Type.NUMBER, desc = "Size of the file buffers") public static final String LOADER_DISK_BUFFER_SIZE_KEY = "loader.disk.fileBufferSize"; /** * Key for {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDTDisk(java.util.Iterator, String, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)}, * specify that the method doesn't have to copy the triple strings between 2 calls to the iterator, default false */ + @Key(type = Key.Type.BOOLEAN, desc = "specify that the method doesn't have to copy the triple strings between 2 calls to the iterator") public static final String LOADER_DISK_NO_COPY_ITERATOR_KEY = "loader.disk.noCopyIterator"; /** @@ -73,22 +93,27 @@ public class HDTOptionsKeys { * method, this key isn't working with the other methods. * Value can be {@link #LOADER_TYPE_VALUE_ONE_PASS}, {@link #LOADER_TYPE_VALUE_TWO_PASS}, {@link #LOADER_TYPE_VALUE_CAT} or {@link #LOADER_TYPE_VALUE_DISK}. */ + @Key(type = Key.Type.ENUM, desc = "HDT generation loader type") public static final String LOADER_TYPE_KEY = "loader.type"; /** * Value for {@link #LOADER_TYPE_KEY}, read using disk generation, reduce the RAM usage and increase disk usage */ + @Value(value = LOADER_TYPE_KEY, desc = "Using genDisk") public static final String LOADER_TYPE_VALUE_DISK = "disk"; /** * Value for {@link #LOADER_TYPE_KEY}, read using HDTCat generation, merge using HDTCat HDT, reduce the RAM usage */ + @Value(value = LOADER_TYPE_KEY, desc = "Using HDTCat") public static final String LOADER_TYPE_VALUE_CAT = "cat"; /** * Value for {@link #LOADER_TYPE_KEY}, read twice the RDF file, reduce the RAM usage */ + @Value(value = LOADER_TYPE_KEY, desc = "Using two pass algorithm") public static final String LOADER_TYPE_VALUE_TWO_PASS = "two-pass"; /** * Value for {@link #LOADER_TYPE_KEY}, read only once the RDF file, default value */ + @Value(value = LOADER_TYPE_KEY, desc = "Using one pass algorithm") public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass"; /** @@ -96,38 +121,46 @@ public class HDTOptionsKeys { * this directory will be deleted after the HDT generation. by default, the value is random, it is recommended to * set this option to delete the directory in case of an interruption of the process. file value. */ + @Key(type = Key.Type.PATH, desc = "Path of the CatTree generation") public static final String LOADER_CATTREE_LOCATION_KEY = "loader.cattree.location"; /** * Same as {@link #LOADER_TYPE_KEY} for loader in the CATTREE method */ + @Key(desc = "Loader of the hdt generation") public static final String LOADER_CATTREE_LOADERTYPE_KEY = "loader.cattree.loadertype"; /** * Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} catTree methods, * this option will create a hdt file after the HDT generation, the returned HDT will be a mapped HDT of the HDT * file. slower, increase the disk usage, but drastically reduce the RAM usage. file value. */ + @Key(type = Key.Type.PATH, desc = "Location of the future HDT") public static final String LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY = "loader.cattree.futureHDTLocation"; /** * Key for the fault factor for the {@link org.rdfhdt.hdt.hdt.HDTManager} catTree default value of the * split size of the RDFFluxStop in the generateHDT method. 
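With these annotated constants, option setup stays free of magic strings. A sketch selecting the CatTree loader with the disk supplier; the working directory is a placeholder:

    HDTSpecification spec = new HDTSpecification();
    spec.set(HDTOptionsKeys.LOADER_TYPE_KEY, HDTOptionsKeys.LOADER_TYPE_VALUE_CAT);
    spec.set(HDTOptionsKeys.LOADER_CATTREE_LOCATION_KEY, "/tmp/cattree");
    spec.set(HDTOptionsKeys.HDT_SUPPLIER_KEY,
            HDTOptionsKeys.LOADER_CATTREE_HDT_SUPPLIER_VALUE_DISK);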
*/ + @Key(type = Key.Type.DOUBLE, desc = "Memory fault factor for HDTCat tree method split") public static final String LOADER_CATTREE_MEMORY_FAULT_FACTOR = "loader.cattree.memoryFaultFactor"; /** * Key for the hdt supplier type, default to memory */ + @Key(type = Key.Type.ENUM, desc = "HDTCat supplier type") public static final String HDT_SUPPLIER_KEY = "supplier.type"; /** * Value for {@link #HDT_SUPPLIER_KEY}, use HDTGenDisk to create the HDT */ + @Value(value = HDT_SUPPLIER_KEY, desc = "using genDisk") public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_DISK = "disk"; /** * Value for {@link #HDT_SUPPLIER_KEY}, use the default memory implementation to create the HDT */ + @Value(value = HDT_SUPPLIER_KEY, desc = "using gen in memory") public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_MEMORY = "memory"; /** * Key for the rdf flux stop type, default to the maximum memory allocated */ + @Key(desc = "API use") public static final String RDF_FLUX_STOP_KEY = "rdffluxstop.type"; /** * Value type for the {@link #RDF_FLUX_STOP_KEY}, using {@link RDFFluxStop#asConfig()} would be easier @@ -157,71 +190,160 @@ public class HDTOptionsKeys { /** * Key for enabling the profiler (if implemented), default to false. Boolean value */ + @Key(type = Key.Type.BOOLEAN, desc = "Use the profiler to get the time of each section") public static final String PROFILER_KEY = "profiler"; /** * Key for the profiler output (if implemented). File value */ + @Key(type = Key.Type.PATH, desc = "Profiler output file") public static final String PROFILER_OUTPUT_KEY = "profiler.output"; /** * Key for enabling the canonical NTriple file simple parser, default to false. Boolean value */ + @Key(type = Key.Type.BOOLEAN, desc = "Use the canonical NT file parser, removing checks") public static final String NT_SIMPLE_PARSER_KEY = "parser.ntSimpleParser"; /** * Key for setting the triple order. 
see {@link org.rdfhdt.hdt.enums.TripleComponentOrder}'s names to have the values * default to {@link org.rdfhdt.hdt.enums.TripleComponentOrder#SPO} */ + @Key(type = Key.Type.STRING, desc = "HDT generation triple order") public static final String TRIPLE_ORDER_KEY = "triplesOrder"; /** * Option to set how the HDTs are loaded in HDTCat/HDTDiff, default {@link #LOAD_HDT_TYPE_VALUE_MAP} */ + @Key(type = Key.Type.ENUM, desc = "loading type for HDTCat / HDTDiff") public static final String LOAD_HDT_TYPE_KEY = "loader.hdt.type"; /** * load the HDT file into memory */ + @Value(value = LOAD_HDT_TYPE_KEY, desc = "load the HDTs in memory") public static final String LOAD_HDT_TYPE_VALUE_LOAD = "load"; /** * map the HDT file, default value */ + @Value(value = LOAD_HDT_TYPE_KEY, desc = "map the HDTs") public static final String LOAD_HDT_TYPE_VALUE_MAP = "map"; /** * Implementation of the temporary dictionary */ + @Key(type = Key.Type.ENUM, desc = "Internal temporary dictionary") public static final String TEMP_DICTIONARY_IMPL_KEY = "tempDictionary.impl"; /** * use Hash map to create the HDT */ + @Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary") public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH = "hash"; /** * use Hash map to create the HDT and store the multisection dictionary, mandatory to create MSC */ + @Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary with literal count") public static final String TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH = "multHash"; /** * use Hash map with Prefix AND Suffix front-coded (PSFC), mandatory to create PSFC dictionary */ + @Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "Prefix AND Suffix front-coded (PSFC) hash dictionary") public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH_PSFC = "hashPsfc"; /** * Implementation of the dictionary */ + @Key(type = Key.Type.ENUM, desc = "HDT dictionary type") public static final String DICTIONARY_TYPE_KEY = "dictionary.type"; /** * 4 Section dictionary */ + @Value(value = DICTIONARY_TYPE_KEY, desc = "Four sectiob dictionary") public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION; /** * Prefix AND Suffix front-coded (PSFC) 4 Section dictionary */ + @Value(value = DICTIONARY_TYPE_KEY, desc = "Prefix AND Suffix front-coded (PSFC) four section dictionary") public static final String DICTIONARY_TYPE_VALUE_FOUR_PSFC_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION; /** * big 4 Section dictionary */ - public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG ="dictionaryFourBig"; + @Value(value = DICTIONARY_TYPE_KEY, desc = "Four section dictionary big") + public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG = "dictionaryFourBig"; /** * multi section dictionary */ + @Value(value = DICTIONARY_TYPE_KEY, desc = "Multi section dictionary") public static final String DICTIONARY_TYPE_VALUE_MULTI_OBJECTS = "dictionaryMultiObj"; - private HDTOptionsKeys() {} + // use tree-map to have a better order + private static final Map OPTION_MAP = new TreeMap<>(); + + static { + try { + for (Field f : HDTOptionsKeys.class.getDeclaredFields()) { + Key key = f.getAnnotation(Key.class); + if (key != null) { + String keyValue = (String) f.get(null); + + OPTION_MAP.put(keyValue, new Option(keyValue, key)); + } else { + Value value = f.getAnnotation(Value.class); + if (value != null) { + String valueValue = (String) f.get(null); + Option opt = OPTION_MAP.get(value.value()); + if (opt != null) { + opt.values.add(new OptionValue(valueValue, 
diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java
new file mode 100644
index 00000000..9026c7f7
--- /dev/null
+++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java
@@ -0,0 +1,25 @@
+package org.rdfhdt.hdt.options;
+
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+
+@Retention(RetentionPolicy.RUNTIME)
+public @interface Key {
+ enum Type {
+ STRING("String"), PATH("Path"), NUMBER("Number"), DOUBLE("Double"), BOOLEAN("Boolean"), ENUM("Enum");
+
+ private final String title;
+
+ Type(String title) {
+ this.title = title;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+ }
+
+ String desc() default "";
+
+ Type type() default Type.STRING;
+}
diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java
new file mode 100644
index 00000000..6294587b
--- /dev/null
+++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java
@@ -0,0 +1,10 @@
+package org.rdfhdt.hdt.options;
+
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+
+@Retention(RetentionPolicy.RUNTIME)
+public @interface Value {
+ String value();
+ String desc() default "";
+}
diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java b/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java
index 42552eb8..aa18ecaa 100644
--- a/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java
+++ b/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java
@@ -211,6 +211,7 @@ else if (c == '\\') {
 startIdx = backSlashIdx + 2;
 }
 else if (c == 'u') {
+ // not canonical but whatever
 // \\uxxxx
 if (backSlashIdx + 5 >= sLength) {
 throw new IllegalArgumentException(
@@ -230,6 +231,7 @@ else if (c == 'u') {
 }
 }
 else if (c == 'U') {
+ // not canonical but whatever
 // \\Uxxxxxxxx
 if (backSlashIdx + 9 >= sLength) {
 throw new IllegalArgumentException(
@@ -238,8 +240,10 @@ else if (c == 'U') {
 String xx = s.substring(backSlashIdx + 2, backSlashIdx + 10);
 try {
- c = (char)Integer.parseInt(xx, 16);
- sb.append(c);
+ char[] chars = Character.toChars(Integer.parseInt(xx, 16));
+ for (char cc : chars) {
+ sb.append(cc);
+ }
 startIdx = backSlashIdx + 10;
 }
diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
index 59395405..6aa335cb 100644
--- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
+++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
@@ -29,6 +29,7 @@ import java.io.IOException; import java.net.URI; import java.nio.file.Path; +import java.util.Collection; import java.util.List; import org.rdfhdt.hdt.enums.CompressionType; @@ -109,6 +110,11 @@ private static long getMaxTreeCatChunkSize() { @Parameter(names = "-multithread", description = "Use multithread logger") public boolean multiThreadLog; + @Parameter(names = "-printoptions", description = "Print options") + public boolean printoptions; + @Parameter(names = "-color", description = "Print using color (if available)") + public boolean color; + private static long findBestMemoryChunkDiskMapTreeCat() { Runtime runtime = Runtime.getRuntime(); long maxRam = (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * 0.85) / 3; @@ -190,7 +196,7 @@ public void execute() throws ParserException, IOException { System.out.println("Using temp directory " + diskLocation); } } - MultiThreadListenerConsole listenerConsole = !quiet ? new MultiThreadListenerConsole() : null; + MultiThreadListenerConsole listenerConsole = !quiet ? new MultiThreadListenerConsole(color) : null; hdt = HDTManager.catTree( RDFFluxStop.countLimit(findBestMemoryChunkDiskMapTreeCat()), HDTSupplier.disk(), @@ -225,14 +231,14 @@ public void execute() throws ParserException, IOException { System.out.println("Using temp directory " + diskLocation); } } - MultiThreadListenerConsole listenerConsole = !quiet ? new MultiThreadListenerConsole() : null; + MultiThreadListenerConsole listenerConsole = !quiet ? new MultiThreadListenerConsole(color) : null; hdt = HDTManager.generateHDTDisk(rdfInput, baseURI, notation, CompressionType.guess(rdfInput), spec, listenerConsole); if (listenerConsole != null) { listenerConsole.notifyProgress(100, "done"); } } else { ProgressListener listenerConsole = - !quiet ? (multiThreadLog ? new MultiThreadListenerConsole() : this) + !quiet ? (multiThreadLog ? new MultiThreadListenerConsole(color) : this) : null; hdt = HDTManager.generateHDT(rdfInput, baseURI, notation, spec, listenerConsole); } @@ -279,12 +285,58 @@ public void notifyProgress(float level, String message) { } } + private String color(int r, int g, int b) { + if (!color) { + return ""; + } + int color = 16 + 36*r + 6 * g + b; + return "\033[38;5;"+color+"m"; + } + + private String colorReset() { + return color ? 
"\033[0m" : ""; + } + @SuppressWarnings("deprecation") public static void main(String[] args) throws Throwable { RDF2HDT rdf2hdt = new RDF2HDT(); JCommander com = new JCommander(rdf2hdt, args); com.setProgramName("rdf2hdt"); - + + if (rdf2hdt.printoptions) { + Collection values = HDTOptionsKeys.getOptionMap().values(); + + for (HDTOptionsKeys.Option opt : values) { + System.out.println(rdf2hdt.color(3, 1, 5) + "Key: " + rdf2hdt.color(5, 1, 0) + opt.getKey()); + if (!opt.getKeyInfo().desc().isEmpty()) { + System.out.println(rdf2hdt.color(3, 1, 5) + "Desc: " + rdf2hdt.colorReset() + opt.getKeyInfo().desc()); + } + System.out.println(rdf2hdt.color(3, 1, 5) + "Type: " + rdf2hdt.colorReset() + opt.getKeyInfo().type().getTitle()); + switch (opt.getKeyInfo().type()) { + case BOOLEAN: + System.out.println(rdf2hdt.color(3, 1, 5) + "Possible value: " + rdf2hdt.colorReset() + "true|false"); + break; + case ENUM: + System.out.println(rdf2hdt.color(3, 1, 5) + "Possible value:"); + int max = opt.getValues().stream().mapToInt(vle -> vle.getValue().length()).max().orElse(0); + for (HDTOptionsKeys.OptionValue vle : opt.getValues()) { + System.out.print(rdf2hdt.color(3, 3, 3) + "- " + rdf2hdt.colorReset() + vle.getValue()); + if (!vle.getValueInfo().desc().isEmpty()) { + System.out.println(rdf2hdt.color(3, 3, 3) + " ".repeat(max - vle.getValue().length()) + " : " + vle.getValueInfo().desc()); + } else { + System.out.println(); + } + } + break; + default: + break; + } + System.out.println("\n"); + } + + return; + } + if(rdf2hdt.parameters.size()==1) { System.err.println("No input file specified, reading from standard input."); rdf2hdt.rdfInput = "-"; diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java index 915b1e8f..dca6d866 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java @@ -29,13 +29,15 @@ private static String goBackNLine(int line) { } private final Map threadMessages; + private final boolean color; private int previous; - public MultiThreadListenerConsole() { - this(ALLOW_ASCII_SEQUENCE); + public MultiThreadListenerConsole(boolean color) { + this(color, ALLOW_ASCII_SEQUENCE); } - public MultiThreadListenerConsole(boolean asciiListener) { + public MultiThreadListenerConsole(boolean color, boolean asciiListener) { + this.color = color; if (asciiListener) { threadMessages = new TreeMap<>(); } else { @@ -43,6 +45,26 @@ public MultiThreadListenerConsole(boolean asciiListener) { } } + public String color(int r, int g, int b) { + if (!color) { + return ""; + } + int color = 16 + 36 * r + 6 * g + b; + return "\033[38;5;" + color + "m"; + } + + public String colorReset() { + return color ? 
"\033[0m" : ""; + } + + public String colorThread() { + return color(5, 1, 1); + } + + public String colorPercentage() { + return color(1, 1, 5); + } + @Override public synchronized void unregisterAllThreads() { if (threadMessages == null) { @@ -68,12 +90,12 @@ public synchronized void unregisterThread(String threadName) { @Override public synchronized void notifyProgress(String thread, float level, String message) { - String msg = "[" + level + "] " + message; + String msg = colorPercentage() + "[" + level + "] " + colorReset() + message; if (threadMessages != null) { threadMessages.put(thread, msg); render(); } else { - System.out.println("[" + thread + "]" + msg); + System.out.println(colorThread() + "[" + thread + "]" + colorReset() + msg); } } @@ -89,13 +111,14 @@ private void render() { message.append(goBackNLine(previous)); } // write each thread logs - threadMessages.forEach((thread, msg) -> { - message.append(ERASE_LINE).append("[").append(thread).append("]").append(msg).append("\n"); - }); + threadMessages.forEach((thread, msg) -> message + .append(ERASE_LINE) + .append(colorThread()).append("[").append(thread).append("]") + .append(msg).append("\n")); // remove previous printing int toRemove = previous - lines; if (toRemove > 0) { - message.append((ERASE_LINE+"\n").repeat(toRemove)).append(goBackNLine(toRemove)); + message.append((ERASE_LINE + "\n").repeat(toRemove)).append(goBackNLine(toRemove)); } previous = lines; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index 355fcf67..7b75763b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -451,7 +451,7 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< // if a future HDT location has been asked, move to it and map the HDT if (futureHDTLocation != null) { - Files.createDirectories(futureHDTLocation.getParent()); + Files.createDirectories(futureHDTLocation.toAbsolutePath().getParent()); Files.deleteIfExists(futureHDTLocation); Files.move(hdtFile, futureHDTLocation); return HDTManager.mapHDT(futureHDTLocation.toAbsolutePath().toString()); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java index aeca8775..1ca37c31 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java @@ -1,6 +1,7 @@ package org.rdfhdt.hdt.util.string; public interface ByteString extends CharSequence, Comparable { + int UTF8_BIG = 2; static ByteString of(CharSequence sec) { return ByteStringUtil.asByteString(sec); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java index cd430cc7..139c587a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java @@ -8,15 +8,19 @@ import java.util.function.Consumer; public class DebugOrderNodeIterator implements Consumer { - public static boolean isAssertEnable() { + private static boolean assertEnabled; + static { try { assert false; - return false; } catch (AssertionError e) { - return true; + assertEnabled = true; } } + public static boolean isAssertEnable() 
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java
index aeca8775..1ca37c31 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java
@@ -1,6 +1,7 @@
 package org.rdfhdt.hdt.util.string;
 public interface ByteString extends CharSequence, Comparable<ByteString> {
+ int UTF8_BIG = 2;
 static ByteString of(CharSequence sec) {
 return ByteStringUtil.asByteString(sec);
 }
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java
index cd430cc7..139c587a 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java
@@ -8,15 +8,19 @@ import java.util.function.Consumer;
 public class DebugOrderNodeIterator implements Consumer<IndexedNode> {
- public static boolean isAssertEnable() {
+ private static boolean assertEnabled;
+ static {
 try {
 assert false;
- return false;
 } catch (AssertionError e) {
- return true;
+ assertEnabled = true;
 }
 }
+ public static boolean isAssertEnable() {
+ return assertEnabled;
+ }
+
 public static Consumer<IndexedNode> of(String name) {
 return of(name, false);
 }
 }
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java
index a1f4ff49..600a9e4b 100644
--- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java
@@ -844,6 +844,21 @@ public void diffMultiSectTest() throws ParserException, IOException, NotFoundExc
 @Ignore("handTests")
 public static class HandTest extends HDTManagerTestBase {
+ @Test
+ public void qzdqzdTest() throws ParserException, IOException {
+ String path = "/Users/ate/workspace/qacompany/hdt-java-ate47/hdt-java-package/target/hdt-java-package-3.0.5-distribution/hdt-java-package-3.0.5/bin/shit.nt.gz";
+
+ HDTSpecification spec = new HDTSpecification();
+ spec.load("/Users/ate/workspace/qacompany/hdt-java-ate47/hdt-java-package/target/hdt-java-package-3.0.5-distribution/hdt-java-package-3.0.5/bin/option.hdtspec");
+
+
+ try (HDT hdt = HDTManager.generateHDTDisk(path, "http://ex.ogr/#", spec,
+ (level, message) -> System.out.println("[" + level + "] " + message)
+ )) {
+ System.out.println(hdt.getTriples().getNumberOfElements());
+ }
+
+ }
 @Test
 public void bigDiskTest() throws ParserException, IOException {
 LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/UnicodeEscapeTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/UnicodeEscapeTest.java
new file mode 100644
index 00000000..927e84a5
--- /dev/null
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/UnicodeEscapeTest.java
@@ -0,0 +1,65 @@
+package org.rdfhdt.hdt.util;
+
+import org.junit.Test;
+import org.rdfhdt.hdt.enums.RDFNotation;
+import org.rdfhdt.hdt.exceptions.ParserException;
+import org.rdfhdt.hdt.rdf.RDFParserCallback;
+import org.rdfhdt.hdt.rdf.RDFParserFactory;
+import org.rdfhdt.hdt.triples.TripleString;
+import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils;
+
+import java.io.IOException;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Objects;
+import java.util.Set;
+import java.util.TreeSet;
+
+import static org.junit.Assert.assertEquals;
+
+public class UnicodeEscapeTest {
+ @Test
+ public void encodeTest() throws ParserException {
+ String file = Objects.requireNonNull(UnicodeEscapeTest.class.getClassLoader().getResource("unicodeTest.nt"), "can't find file").getFile();
+
+ RDFParserCallback factory = RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES, true);
+ RDFParserCallback factory2 = RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES, false);
+
+
+ Set<TripleString> ts1 = new TreeSet<>(Comparator.comparing(t -> {
+ try {
+ return t.asNtriple().toString();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }));
+ Set<TripleString> ts2 = new TreeSet<>(Comparator.comparing(t -> {
+ try {
+ return t.asNtriple().toString();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }));
+ factory.doParse(file, HDTTestUtils.BASE_URI, RDFNotation.NTRIPLES, true, (t, i) -> ts1.add(t.tripleToString()));
+ factory2.doParse(file, HDTTestUtils.BASE_URI, RDFNotation.NTRIPLES, true, (t, i) -> ts2.add(t.tripleToString()));
+
+ Iterator<TripleString> it1 = ts1.iterator();
+ Iterator<TripleString> it2 = ts2.iterator();
+
+ HDTTestUtils.CoIterator<TripleString, TripleString> it = new HDTTestUtils.CoIterator<>(it1, it2);
+
+ while (it.hasNext()) {
+ HDTTestUtils.Tuple<TripleString, TripleString> e = it.next();
+ System.out.println(e);
+ assertEquals(e.t1, e.t2);
+ }
+ }
+ @Test
+ public void decodeTest() {
+ assertEquals(
+ "\uD877\uDD76",
+ UnicodeEscape.unescapeString("\\U0002dd76")
+ );
+
+ }
+}
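decodeTest above pins down why UnicodeEscape now uses Character.toChars: a code point beyond U+FFFF does not fit in one char, and the old (char) cast silently kept only the low 16 bits. A sketch with the same code point (not part of the diff):

    public class SupplementaryEscape {
        public static void main(String[] args) {
            int codePoint = 0x2DD76;                    // \U0002dd76 from decodeTest
            char truncated = (char) codePoint;          // 0xDD76, an unpaired low surrogate
            char[] pair = Character.toChars(codePoint); // {0xD877, 0xDD76}, a valid surrogate pair
            System.out.println(Integer.toHexString(truncated));          // dd76
            System.out.println(new String(pair).equals("\uD877\uDD76")); // true
        }
    }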
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/ByteStringTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/ByteStringTest.java
new file mode 100644
index 00000000..9001c048
--- /dev/null
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/ByteStringTest.java
@@ -0,0 +1,48 @@
+package org.rdfhdt.hdt.util.string;
+
+import org.junit.Test;
+
+import java.text.Collator;
+
+public class ByteStringTest {
+ private static void printHex(byte[] b) {
+ for (byte bb : b) {
+ System.out.printf("%2x ", bb);
+ }
+ System.out.println();
+ }
+ private static void printBin(byte[] b) {
+ for (byte bb : b) {
+ String s = Integer.toBinaryString(bb & 0xFF);
+ System.out.print("0".repeat(8 - s.length()) + s + " ");
+ }
+ System.out.println();
+ }
+ @Test
+ public void utf32Test() {
+ String ss1 = "\uD85B\uDCE3";
+ String ss2 = "\uF4D1";
+
+ ByteString b1 = ByteString.of(ss1);
+ ByteString b2 = ByteString.of(ss2);
+
+ assert ss1.equals(b1.toString());
+ assert ss2.equals(b2.toString());
+
+ Collator coll = Collator.getInstance();
+
+ System.out.println("BYTESTRING: " + b1 + (b1.compareTo(b2) < 0 ? " < " : " > ") + b2);
+ System.out.println("STRING    : " + b1 + (b1.toString().compareTo(b2.toString()) < 0 ? " < " : " > ") + b2);
+ System.out.println("COLLATOR  : " + b1 + (coll.compare(b1.toString(), b2.toString()) < 0 ? " < " : " > ") + b2);
+
+ printHex(b1.getBuffer());
+ printHex(b2.getBuffer());
+
+ printBin(b1.getBuffer());
+ printBin(b2.getBuffer());
+
+ System.out.println(Character.isHighSurrogate(ss1.charAt(0)) + ", " + Character.isLowSurrogate(ss1.charAt(1)));
+ System.out.println(Character.toCodePoint(ss1.charAt(0), ss1.charAt(1)));
+ System.out.println((int) ss2.charAt(0));
+ }
+}
diff --git a/hdt-java-core/src/test/resources/unicodeTest.nt b/hdt-java-core/src/test/resources/unicodeTest.nt
new file mode 100644
index 00000000..1d0deb79
--- /dev/null
+++ b/hdt-java-core/src/test/resources/unicodeTest.nt
@@ -0,0 +1,506 @@
+ "\u5967\u53E4\u65AF\u4E01"@zh-hant .
+ "\u5967\u53E4\u65AF\u4E01"@zh-hant .
+ "\u5967\u53E4\u65AF\u4E01"@zh-hant .
+ "\u963F\u4EC0\u9813\u99AC\u4E01Vantage"@zh-hant .
+ "\u963F\u4EC0\u9813\u99AC\u4E01Vantage"@zh-hant .
+ "\u963F\u4EC0\u9813\u99AC\u4E01Vantage"@zh-hant .
+ "\u963F\u4EC0\u9813\u99AC\u4E01Vantage2018"@zh-hant .
+ "\u4E01\u6642\u767C"@zh-hant .
+ "\u4E01\u6642\u767C"@zh-hant .
+ "\u4E01\u6642\u767C"@zh-hant .
+ "\u4E01B"@zh-hant .
+ "\u4E01B"@zh-hant .
+ "\u4E01B"@zh-hant .
+ "\u4E01\u73E4\u6210"@zh-hant .
+ "\u4E01\u73E4\u6210"@zh-hant .
+ "\u4E01\u73E4\u6210"@zh-hant .
+ "\u4E01\u5357\u53DF"@zh-hant .
+ "\u4E01\u5357\u53DF"@zh-hant .
+ "\u4E01\u5357\u53DF"@zh-hant .
+ "\u4E01\u58FD\u8001"@zh-hant .
+ "\u4E01\u58FD\u8001"@zh-hant .
+ "\u4E01\u58FD\u8001"@zh-hant .
+ "\u4E01\u96C4\u975E"@zh-hant .
+ "\u4E01\u96C4\u975E"@zh-hant .
+ "\u4E01\u96C4\u975E"@zh-hant .
+ "\u4E01\u6E90"@zh-hant .
+ "\u4E01\u6E90"@zh-hant .
+ "\u4E01\u6E90"@zh-hant .
+ "\u4E01\u9865"@zh-hant .
+ "\u4E01\u9865"@zh-hant .
+ "\u4E01\u9865"@zh-hant .
+ "\u4E01\u5E0C\u4EAE"@zh-hant .
+ "\u4E01\u5E0C\u4EAE"@zh-hant .
+ "\u4E01\u5E0C\u4EAE"@zh-hant .
+ "\u4E01\u54B8"@zh-hant .
+ "\u4E01\u54B8"@zh-hant .
+ "\u4E01\u54B8"@zh-hant .
+ "\u4E01\u55E3\u5EF6"@zh-hant .
+ "\u4E01\u55E3\u5EF6"@zh-hant .
+ "\u4E01\u55E3\u5EF6"@zh-hant .
+ "\u4E01\u58EB\u905C"@zh-hant .
+ "\u4E01\u58EB\u905C"@zh-hant .
+ "\u4E01\u58EB\u905C"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u8077\u99AC\u797F\u4E01"@zh-hant . + "\u8077\u99AC\u797F\u4E01"@zh-hant . + "\u8077\u99AC\u797F\u4E01"@zh-hant . + "\u4E01\u5043"@zh-hant . + "\u4E01\u5043"@zh-hant . + "\u4E01\u5043"@zh-hant . + "\u4E01\u6FE4"@zh-hant . + "\u4E01\u6FE4"@zh-hant . + "\u4E01\u6FE4"@zh-hant . + "\u4E01\u57F7\u79AE"@zh-hant . + "\u4E01\u57F7\u79AE"@zh-hant . + "\u4E01\u57F7\u79AE"@zh-hant . + "\u4E01\u5C11\u5FAE"@zh-hant . + "\u4E01\u5C11\u5FAE"@zh-hant . + "\u4E01\u5C11\u5FAE"@zh-hant . + "\u4E01\u4FE0"@zh-hant . + "\u4E01\u4FE0"@zh-hant . + "\u4E01\u4FE0"@zh-hant . + "\u4E01\u5B88\u5FB7"@zh-hant . + "\u4E01\u5B88\u5FB7"@zh-hant . + "\u4E01\u5B88\u5FB7"@zh-hant . + "\u4E01\u77F3"@zh-hant . + "\u4E01\u77F3"@zh-hant . + "\u4E01\u77F3"@zh-hant . + "\u4E01\u61C9\u594E"@zh-hant . + "\u4E01\u61C9\u594E"@zh-hant . + "\u4E01\u61C9\u594E"@zh-hant . + "\u4E01\u826F\u537F"@zh-hant . + "\u4E01\u826F\u537F"@zh-hant . + "\u4E01\u826F\u537F"@zh-hant . + "\u4E01\u5143\u8F14"@zh-hant . + "\u4E01\u5143\u8F14"@zh-hant . + "\u4E01\u5143\u8F14"@zh-hant . + "\u4E01\u5143\u82F1"@zh-hant . + "\u4E01\u5143\u82F1"@zh-hant . + "\u4E01\u5143\u82F1"@zh-hant . + "\u4E01\u61C9\u5BA6"@zh-hant . + "\u4E01\u61C9\u5BA6"@zh-hant . + "\u4E01\u61C9\u5BA6"@zh-hant . + "\u4E01\u61C9\u5BF5"@zh-hant . + "\u4E01\u61C9\u5BF5"@zh-hant . + "\u4E01\u61C9\u5BF5"@zh-hant . + "\u4E01\u9326\u5B65"@zh-hant . + "\u4E01\u9326\u5B65"@zh-hant . + "\u4E01\u9326\u5B65"@zh-hant . + "\u4E01\u9580\u96C5"@zh-hant . + "\u4E01\u9580\u96C5"@zh-hant . + "\u4E01\u9580\u96C5"@zh-hant . + "\u963F\u8001\u74E6\u4E01"@zh-hant . + "\u963F\u8001\u74E6\u4E01"@zh-hant . + "\u963F\u8001\u74E6\u4E01"@zh-hant . + "\u4E01\u5143\u4EA8"@zh-hant . + "\u4E01\u5143\u4EA8"@zh-hant . + "\u4E01\u5143\u4EA8"@zh-hant . + "\u4E01\u6D2A\u653F"@zh-hant . + "\u4E01\u6D2A\u653F"@zh-hant . + "\u4E01\u6D2A\u653F"@zh-hant . + "\u4E01\u7FA9\u5B5A"@zh-hant . + "\u4E01\u7FA9\u5B5A"@zh-hant . + "\u4E01\u7FA9\u5B5A"@zh-hant . + "\u4E01\u5168\u67D0"@zh-hant . + "\u4E01\u5168\u67D0"@zh-hant . + "\u4E01\u5168\u67D0"@zh-hant . + "\u4E01\u5B50\u8CB4"@zh-hant . + "\u4E01\u5B50\u8CB4"@zh-hant . + "\u4E01\u5B50\u8CB4"@zh-hant . + "\u4E01\u5FB7\u7D39"@zh-hant . + "\u4E01\u5FB7\u7D39"@zh-hant . + "\u4E01\u5FB7\u7D39"@zh-hant . + "\u4E01\u8AA0"@zh-hant . + "\u4E01\u8AA0"@zh-hant . + "\u4E01\u8AA0"@zh-hant . + "\u4E01\u4EF2\u980A"@zh-hant . + "\u4E01\u4EF2\u980A"@zh-hant . + "\u4E01\u4EF2\u980A"@zh-hant . + "\u4E01\u6301\u656C"@zh-hant . + "\u4E01\u6301\u656C"@zh-hant . + "\u4E01\u6301\u656C"@zh-hant . + "\u4E01\u6FDB"@zh-hant . + "\u4E01\u6FDB"@zh-hant . + "\u4E01\u6FDB"@zh-hant . + "\u4E0D\u9B6F\u7F55\u4E01"@zh-hant . + "\u4E0D\u9B6F\u7F55\u4E01"@zh-hant . + "\u4E0D\u9B6F\u7F55\u4E01"@zh-hant . + "\u4E01\u6C76"@zh-hant . + "\u4E01\u6C76"@zh-hant . + "\u4E01\u6C76"@zh-hant . + "\u4E01\u8AAA"@zh-hant . + "\u4E01\u8AAA"@zh-hant . + "\u4E01\u8AAA"@zh-hant . + "\u4E01\u7430"@zh-hant . + "\u4E01\u7430"@zh-hant . + "\u4E01\u7430"@zh-hant . + "\u4E01\u93DE"@zh-hant . + "\u4E01\u93DE"@zh-hant . + "\u4E01\u93DE"@zh-hant . + "\u4E01\u79C0"@zh-hant . + "\u4E01\u79C0"@zh-hant . + "\u4E01\u79C0"@zh-hant . + "\u4E01\u742A"@zh-hant . + "\u4E01\u742A"@zh-hant . + "\u4E01\u742A"@zh-hant . + "\u4E01\u73CF"@zh-hant . + "\u4E01\u73CF"@zh-hant . + "\u4E01\u73CF"@zh-hant . + "\u4E01\u7489"@zh-hant . + "\u4E01\u7489"@zh-hant . + "\u4E01\u7489"@zh-hant . + "\u4E01\u6C5D\u76F8"@zh-hant . 
+ "\u4E01\u6C5D\u76F8"@zh-hant . + "\u4E01\u6C5D\u76F8"@zh-hant . + "\u4E01\u61C9\u8FB0"@zh-hant . + "\u4E01\u61C9\u8FB0"@zh-hant . + "\u4E01\u61C9\u8FB0"@zh-hant . + "\u5409\u96C5\u8B28\u4E01"@zh-hant . + "\u5409\u96C5\u8B28\u4E01"@zh-hant . + "\u5409\u96C5\u8B28\u4E01"@zh-hant . + "\u4E01\u7965"@zh-hant . + "\u4E01\u7965"@zh-hant . + "\u4E01\u7965"@zh-hant . + "\u4E01\u5B54\u77B3"@zh-hant . + "\u4E01\u5B54\u77B3"@zh-hant . + "\u4E01\u5B54\u77B3"@zh-hant . + "\u4EA6\u99AC\u90FD\u4E01"@zh-hant . + "\u4EA6\u99AC\u90FD\u4E01"@zh-hant . + "\u4EA6\u99AC\u90FD\u4E01"@zh-hant . + "\u4E01\u5F64"@zh-hant . + "\u4E01\u5F64"@zh-hant . + "\u4E01\u5F64"@zh-hant . + "\u672D\u524C\u9B6F\u4E01"@zh-hant . + "\u672D\u524C\u9B6F\u4E01"@zh-hant . + "\u672D\u524C\u9B6F\u4E01"@zh-hant . + "\u54F2\u99AC\u9B6F\u4E01"@zh-hant . + "\u54F2\u99AC\u9B6F\u4E01"@zh-hant . + "\u54F2\u99AC\u9B6F\u4E01"@zh-hant . + "\u4E01\u7C20"@zh-hant . + "\u4E01\u7C20"@zh-hant . + "\u4E01\u7C20"@zh-hant . + "\u4E01\u667A"@zh-hant . + "\u4E01\u667A"@zh-hant . + "\u4E01\u667A"@zh-hant . + "\u4E01\u525B"@zh-hant . + "\u4E01\u525B"@zh-hant . + "\u4E01\u525B"@zh-hant . + "\u4E01\u61CC"@zh-hant . + "\u4E01\u61CC"@zh-hant . + "\u4E01\u61CC"@zh-hant . + "\u4E01\u826F\u537F"@zh-hant . + "\u4E01\u826F\u537F"@zh-hant . + "\u4E01\u826F\u537F"@zh-hant . + "\u602F\u91CC\u6728\u4E01"@zh-hant . + "\u602F\u91CC\u6728\u4E01"@zh-hant . + "\u602F\u91CC\u6728\u4E01"@zh-hant . + "\u4E01\u5143\u5FA9"@zh-hant . + "\u4E01\u5143\u5FA9"@zh-hant . + "\u4E01\u5143\u5FA9"@zh-hant . + "\u4E01\u65E6"@zh-hant . + "\u4E01\u65E6"@zh-hant . + "\u4E01\u65E6"@zh-hant . + "\u4E01\u6C5D\u8B19"@zh-hant . + "\u4E01\u6C5D\u8B19"@zh-hant . + "\u4E01\u6C5D\u8B19"@zh-hant . + "\u4E01\u73CF"@zh-hant . + "\u4E01\u73CF"@zh-hant . + "\u4E01\u73CF"@zh-hant . + "\u4E01\u4F51"@zh-hant . + "\u4E01\u4F51"@zh-hant . + "\u4E01\u4F51"@zh-hant . + "\u4E01\u93A1"@zh-hant . + "\u4E01\u93A1"@zh-hant . + "\u4E01\u93A1"@zh-hant . + "\u4E01\u61F2"@zh-hant . + "\u4E01\u61F2"@zh-hant . + "\u4E01\u61F2"@zh-hant . + "\u53EF\u99AC\u524C\u4E01"@zh-hant . + "\u53EF\u99AC\u524C\u4E01"@zh-hant . + "\u53EF\u99AC\u524C\u4E01"@zh-hant . + "\u4E01\u5FD7\u5FB7"@zh-hant . + "\u4E01\u5FD7\u5FB7"@zh-hant . + "\u4E01\u5FD7\u5FB7"@zh-hant . + "\u4E01\u8B39"@zh-hant . + "\u4E01\u8B39"@zh-hant . + "\u4E01\u8B39"@zh-hant . + "\u4E01\u6B64\u53EC"@zh-hant . + "\u4E01\u6B64\u53EC"@zh-hant . + "\u4E01\u6B64\u53EC"@zh-hant . + "\u4E01\u79AE"@zh-hant . + "\u4E01\u79AE"@zh-hant . + "\u4E01\u79AE"@zh-hant . + "\u4E01\U0002DD76"@zh-hant . + "\u4E01\U0002DD76"@zh-hant . + "\u4E01\U0002DD76"@zh-hant . + "\u4E01\u714C"@zh-hant . + "\u4E01\u714C"@zh-hant . + "\u4E01\u714C"@zh-hant . + "\u4E01\u7DD2"@zh-hant . + "\u4E01\u7DD2"@zh-hant . + "\u4E01\u7DD2"@zh-hant . + "\u4E01\u5C45\u67D0"@zh-hant . + "\u4E01\u5C45\u67D0"@zh-hant . + "\u4E01\u5C45\u67D0"@zh-hant . + "\u4E01\u5DE8\u7ACB"@zh-hant . + "\u4E01\u5DE8\u7ACB"@zh-hant . + "\u4E01\u5DE8\u7ACB"@zh-hant . + "\u4E01\u58FD"@zh-hant . + "\u4E01\u58FD"@zh-hant . + "\u4E01\u58FD"@zh-hant . + "\u4E01\u6B3D"@zh-hant . + "\u4E01\u6B3D"@zh-hant . + "\u4E01\u6B3D"@zh-hant . + "\u4E01\u662D\u67D0"@zh-hant . + "\u4E01\u662D\u67D0"@zh-hant . + "\u4E01\u662D\u67D0"@zh-hant . + "\u7D0D\u901F\u524C\u4E01"@zh-hant . + "\u7D0D\u901F\u524C\u4E01"@zh-hant . + "\u7D0D\u901F\u524C\u4E01"@zh-hant . + "\u674E\u4E01"@zh-hant . + "\u674E\u4E01"@zh-hant . + "\u674E\u4E01"@zh-hant . + "\u4E01\u6D0C"@zh-hant . + "\u4E01\u6D0C"@zh-hant . + "\u4E01\u6D0C"@zh-hant . + "\u4E01\u6F7E"@zh-hant . 
+ "\u4E01\u6F7E"@zh-hant . + "\u4E01\u6F7E"@zh-hant . + "\u54C8\u8FF7\u4E01"@zh-hant . + "\u54C8\u8FF7\u4E01"@zh-hant . + "\u54C8\u8FF7\u4E01"@zh-hant . + "\u4E01\u5831\u73E0"@zh-hant . + "\u4E01\u5831\u73E0"@zh-hant . + "\u4E01\u5831\u73E0"@zh-hant . + "\u4E01\u5831\u73E0"@zh-hant . + "\u856D\u8207\u4E01"@zh-hant . + "\u856D\u8207\u4E01"@zh-hant . + "\u856D\u8207\u4E01"@zh-hant . + "\u4E01\u58EB"@zh-hant . + "\u4E01\u58EB"@zh-hant . + "\u4E01\u58EB"@zh-hant . + "\u4E01\u7BC4"@zh-hant . + "\u4E01\u7BC4"@zh-hant . + "\u4E01\u7BC4"@zh-hant . + "\u4E01\u5F65"@zh-hant . + "\u4E01\u5F65"@zh-hant . + "\u4E01\u5F65"@zh-hant . + "\u4E01\u6587\u91D7"@zh-hant . + "\u4E01\u6587\u91D7"@zh-hant . + "\u4E01\u6587\u91D7"@zh-hant . + "\u4E01\u658C"@zh-hant . + "\u4E01\u658C"@zh-hant . + "\u4E01\u658C"@zh-hant . + "\u4E01\u6B63\u6587"@zh-hant . + "\u4E01\u6B63\u6587"@zh-hant . + "\u4E01\u6B63\u6587"@zh-hant . + "\u4E01\u7389\u71FE"@zh-hant . + "\u4E01\u7389\u71FE"@zh-hant . + "\u4E01\u7389\u71FE"@zh-hant . + "\u4E01\u5143\u5FA9"@zh-hant . + "\u4E01\u5143\u5FA9"@zh-hant . + "\u4E01\u5143\u5FA9"@zh-hant . + "\u4E01\u5929\u6BD3"@zh-hant . + "\u4E01\u5929\u6BD3"@zh-hant . + "\u4E01\u5929\u6BD3"@zh-hant . + "\u4E01\u5EFA\u696D"@zh-hant . + "\u4E01\u5EFA\u696D"@zh-hant . + "\u4E01\u5EFA\u696D"@zh-hant . + "\u4E01\u6B65\u66FE"@zh-hant . + "\u4E01\u6B65\u66FE"@zh-hant . + "\u4E01\u6B65\u66FE"@zh-hant . + "\u4E01\u5114\u5D69"@zh-hant . + "\u4E01\u5114\u5D69"@zh-hant . + "\u4E01\u5114\u5D69"@zh-hant . + "\u4E01\u7E8C\u66FE"@zh-hant . + "\u4E01\u7E8C\u66FE"@zh-hant . + "\u4E01\u7E8C\u66FE"@zh-hant . + "\u4E01\u5146\u797A"@zh-hant . + "\u4E01\u5146\u797A"@zh-hant . + "\u4E01\u5146\u797A"@zh-hant . + "\u4E01\u5146\u68CB"@zh-hant . + "\u6CD5\u54C8\u524C\u4E01"@zh-hant . + "\u6CD5\u54C8\u524C\u4E01"@zh-hant . + "\u6CD5\u54C8\u524C\u4E01"@zh-hant . + "\u4E01\u5F0F\u73AB"@zh-hant . + "\u4E01\u5F0F\u73AB"@zh-hant . + "\u4E01\u5F0F\u73AB"@zh-hant . + "\u4E01\u8291\u8A52"@zh-hant . + "\u4E01\u8291\u8A52"@zh-hant . + "\u4E01\u8291\u8A52"@zh-hant . + "\u4E01\u6850"@zh-hant . + "\u4E01\u6850"@zh-hant . + "\u4E01\u6850"@zh-hant . + "\u4E01\u8CB4"@zh-hant . + "\u4E01\u8CB4"@zh-hant . + "\u4E01\u8CB4"@zh-hant . + "\u4E01\u66F0\u606D"@zh-hant . + "\u4E01\u66F0\u606D"@zh-hant . + "\u4E01\u66F0\u606D"@zh-hant . + "\u4E01\u744B"@zh-hant . + "\u4E01\u744B"@zh-hant . + "\u4E01\u744B"@zh-hant . + "\u4E01\u74D8"@zh-hant . + "\u4E01\u74D8"@zh-hant . + "\u4E01\u74D8"@zh-hant . + "\u4E01\u601D\u986F"@zh-hant . + "\u4E01\u601D\u986F"@zh-hant . + "\u4E01\u601D\u986F"@zh-hant . + "\u4E01\u6690"@zh-hant . + "\u4E01\u6690"@zh-hant . + "\u4E01\u6690"@zh-hant . + "\u4E01\u9CF3\u5E74"@zh-hant . + "\u4E01\u9CF3\u5E74"@zh-hant . + "\u4E01\u9CF3\u5E74"@zh-hant . + "\u4E01\u61C9\u4EA8"@zh-hant . + "\u4E01\u61C9\u4EA8"@zh-hant . + "\u4E01\u61C9\u4EA8"@zh-hant . + "\u4E01\u93A7"@zh-hant . + "\u4E01\u93A7"@zh-hant . + "\u4E01\u93A7"@zh-hant . + "\u4E01\u7D39\u5949"@zh-hant . + "\u4E01\u7D39\u5949"@zh-hant . + "\u4E01\u7D39\u5949"@zh-hant . + "\u5FFD\u90FD\u4E0D\u4E01"@zh-hant . + "\u5FFD\u90FD\u4E0D\u4E01"@zh-hant . + "\u5FFD\u90FD\u4E0D\u4E01"@zh-hant . + "\u5FFD\u90FD\u4E0D\u4E01"@zh-hant . + "\u5FFD\u90FD\u4E0D\u4E01"@zh-hant . + "\u5FFD\u90FD\u4E0D\u4E01"@zh-hant . + "\u4E01\u6CD5\u5F3A"@zh-hant . + "\u4E01\u6CD5\u5F3A"@zh-hant . + "\u4E01\u6CD5\u5F3A"@zh-hant . + "\u4E01\u5BCC\u83EF"@zh-hant . + "\u4E01\u5BCC\u83EF"@zh-hant . + "\u4E01\u5BCC\u83EF"@zh-hant . + "\u4E01\u7152"@zh-hant . + "\u4E01\u7152"@zh-hant . + "\u4E01\u7152"@zh-hant . 
+ "\u4E01\u92B3\u7FA9"@zh-hant . + "\u4E01\u92B3\u7FA9"@zh-hant . + "\u4E01\u92B3\u7FA9"@zh-hant . + "\u4E01\u9806"@zh-hant . + "\u4E01\u9806"@zh-hant . + "\u4E01\u9806"@zh-hant . + "\u4E01\u6C82"@zh-hant . + "\u4E01\u6C82"@zh-hant . + "\u4E01\u6C82"@zh-hant . + "\u4E01\u67D4\u7ACB"@zh-hant . + "\u4E01\u67D4\u7ACB"@zh-hant . + "\u4E01\u67D4\u7ACB"@zh-hant . + "\u4E01\u8208"@zh-hant . + "\u4E01\u8208"@zh-hant . + "\u4E01\u8208"@zh-hant . + "\u4E01\u931E"@zh-hant . + "\u4E01\u931E"@zh-hant . + "\u4E01\u931E"@zh-hant . + "\u8CFD\u798F\u4E01"@zh-hant . + "\u8CFD\u798F\u4E01"@zh-hant . + "\u8CFD\u798F\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u82EB\u601D\u4E01"@zh-hant . + "\u4E01\u8087\u8056"@zh-hant . + "\u4E01\u8087\u8056"@zh-hant . + "\u4E01\u8087\u8056"@zh-hant . + "\u4E01\u8056\u8087"@zh-hant . + "\u4E01\u5100"@zh-hant . + "\u4E01\u5100"@zh-hant . + "\u4E01\u5100"@zh-hant . + "\u4E01\u4EF2\u4EAC"@zh-hant . + "\u4E01\u4EF2\u4EAC"@zh-hant . + "\u4E01\u4EF2\u4EAC"@zh-hant . + "\u4E01\u6C38\u5B6B"@zh-hant . + "\u4E01\u6C38\u5B6B"@zh-hant . + "\u4E01\u6C38\u5B6B"@zh-hant . + "\u4E01\u9867\u8A00"@zh-hant . + "\u4E01\u9867\u8A00"@zh-hant . + "\u4E01\u9867\u8A00"@zh-hant . + "\u4E01\u9AD4\u5E38"@zh-hant . + "\u4E01\u9AD4\u5E38"@zh-hant . + "\u4E01\u9AD4\u5E38"@zh-hant . + "\u4E01\u7ACB\u5E79"@zh-hant . + "\u4E01\u7ACB\u5E79"@zh-hant . + "\u4E01\u7ACB\u5E79"@zh-hant . + "\u5854\u672E\u4E01"@zh-hant . + "\u5854\u672E\u4E01"@zh-hant . + "\u5854\u672E\u4E01"@zh-hant . + "\u4E01\u4E4B\u683B"@zh-hant . + "\u4E01\u4E4B\u683B"@zh-hant . + "\u4E01\u4E4B\u683B"@zh-hant . + "\u4E01\u541B\u6DD1"@zh-hant . + "\u4E01\u541B\u6DD1"@zh-hant . + "\u4E01\u541B\u6DD1"@zh-hant . + "\u5289\u4E01"@zh-hant . + "\u5289\u4E01"@zh-hant . + "\u5289\u4E01"@zh-hant . + "\u4E01\u67F7"@zh-hant . + "\u4E01\u67F7"@zh-hant . + "\u4E01\u67F7"@zh-hant . + "\u4E01\u4E4B\u7FF0"@zh-hant . + "\u4E01\u4E4B\u7FF0"@zh-hant . + "\u4E01\u4E4B\u7FF0"@zh-hant . + "\u4E01\u986F"@zh-hant . + "\u4E01\u986F"@zh-hant . + "\u4E01\u986F"@zh-hant . + "\u4E01\u7167"@zh-hant . + "\u4E01\u7167"@zh-hant . + "\u4E01\u7167"@zh-hant . + "\u4E01\u714C"@zh-hant . + "\u4E01\u714C"@zh-hant . + "\u4E01\u714C"@zh-hant . + "\u4E01\u746E"@zh-hant . + "\u4E01\u746E"@zh-hant . + "\u4E01\u746E"@zh-hant . + "\u4E01\u58EB\u826F"@zh-hant . + "\u4E01\u58EB\u826F"@zh-hant . + "\u4E01\u58EB\u826F"@zh-hant . + "\u4E01\u5143\u7167"@zh-hant . + "\u4E01\u5143\u7167"@zh-hant . + "\u4E01\u5143\u7167"@zh-hant . + "\u4E01\u6587\u91D7"@zh-hant . + "\u4E01\u6587\u91D7"@zh-hant . + "\u4E01\u6587\u91D7"@zh-hant . + "\u4E01\u7389\u85FB"@zh-hant . + "\u4E01\u7389\u85FB"@zh-hant . + "\u4E01\u7389\u85FB"@zh-hant . + "\u4E01\u6B63\u4E2D"@zh-hant . + "\u4E01\u6B63\u4E2D"@zh-hant . + "\u4E01\u6B63\u4E2D"@zh-hant . + "\u4E01\u7ACB\u4E2D"@zh-hant . + "\u4E01\u7ACB\u4E2D"@zh-hant . + "\u4E01\u7ACB\u4E2D"@zh-hant . + "\u4E01\u767E\u5DDD"@zh-hant . + "\u4E01\u767E\u5DDD"@zh-hant . + "\u4E01\u767E\u5DDD"@zh-hant . + "\u4E01\u6A39\u68E0"@zh-hant . + "\u4E01\u6A39\u68E0"@zh-hant . + "\u4E01\u6A39\u68E0"@zh-hant . + "\u4E01\u601D\u986F"@zh-hant . + "\u4E01\u601D\u986F"@zh-hant . + "\u4E01\u601D\u986F"@zh-hant . + "\u4E01\u990A\u5143"@zh-hant . + "\u4E01\u990A\u5143"@zh-hant . + "\u4E01\u990A\u5143"@zh-hant . + "\u4E01\u9E7F\u9CF4"@zh-hant . 
+ "\u4E01\u9E7F\u9CF4"@zh-hant . + "\u4E01\u9E7F\u9CF4"@zh-hant . + "\u4E01\u932B\u795C"@zh-hant . + "\u4E01\u932B\u795C"@zh-hant . + "\u4E01\u932B\u795C"@zh-hant . + "\u4E01\u5143\u548C"@zh-hant . + "\u4E01\u5143\u548C"@zh-hant . + "\u4E01\u5143\u548C"@zh-hant . + "\u4E01\u6FA4"@zh-hant . + "\u4E01\u6FA4"@zh-hant . + "\u4E01\u6FA4"@zh-hant . + "\u4E01\u6625\u6FA4"@zh-hant . From d1c474b272e616559b73599e217413813bf53f9f Mon Sep 17 00:00:00 2001 From: qaate47 Date: Tue, 8 Nov 2022 17:06:21 +0100 Subject: [PATCH 7/9] Remove String usage of GenDisk, ignore Unicode test, add hdtVerify.bat for Windows, fix string order and better logs for rdf2hdt --- .../rdfhdt/hdt/options/HDTOptionsKeys.java | 41 +- .../main/java/org/rdfhdt/hdt/options/Key.java | 38 +- .../java/org/rdfhdt/hdt/options/Value.java | 16 +- .../java/org/rdfhdt/hdt/tools/HDTVerify.java | 126 ++++--- .../java/org/rdfhdt/hdt/tools/RDF2HDT.java | 70 ++-- .../listener/MultiThreadListenerConsole.java | 16 +- .../rdfhdt/hdt/dictionary/DictionaryCat.java | 5 +- .../rdfhdt/hdt/dictionary/DictionaryDiff.java | 3 +- .../hdt/dictionary/TempDictionarySection.java | 3 +- .../impl/FourSectionDictionaryCat.java | 118 +++--- .../impl/FourSectionDictionaryDiff.java | 51 +-- .../MultDictionaryPFCOptimizedExtractor.java | 36 +- .../impl/MultipleBaseDictionary.java | 51 ++- .../impl/MultipleSectionDictionary.java | 31 +- .../impl/MultipleSectionDictionaryBig.java | 22 +- .../impl/MultipleSectionDictionaryCat.java | 352 ++++++++++-------- .../impl/MultipleSectionDictionaryDiff.java | 99 ++--- .../impl/WriteMultipleSectionDictionary.java | 7 +- .../impl/section/HashDictionarySection.java | 17 +- .../section/OneReadDictionarySection.java | 6 +- .../impl/section/PFCDictionarySection.java | 20 +- .../impl/section/PFCDictionarySectionBig.java | 15 +- .../impl/section/PFCDictionarySectionMap.java | 39 +- .../dictionary/impl/utilCat/CatElement.java | 19 +- .../impl/utilCat/CatIntersection.java | 8 +- .../dictionary/impl/utilCat/CatMapping.java | 5 +- .../hdt/dictionary/impl/utilCat/CatUnion.java | 18 +- .../dictionary/impl/utilCat/CatWrapper.java | 11 +- .../impl/utilCat/IteratorPlusElement.java | 9 +- .../IteratorPlusElementComparator.java | 15 - .../dictionary/impl/utilCat/SectionUtil.java | 52 ++- .../dictionary/impl/utilDiff/DiffWrapper.java | 10 +- .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java | 10 +- .../impl/diskimport/SectionCompressor.java | 14 +- .../hdt/iterator/utils/CombinedIterator.java | 91 +++++ .../iterator/utils/FileTripleIterator.java | 3 +- .../utils/NotificationExceptionIterator.java | 2 +- .../org/rdfhdt/hdt/triples/IndexedNode.java | 18 +- .../impl/BitmapTriplesIteratorCat.java | 32 +- .../impl/BitmapTriplesIteratorMapDiff.java | 14 +- .../org/rdfhdt/hdt/util/CustomIterator.java | 14 +- .../org/rdfhdt/hdt/util/LiteralsUtils.java | 39 +- .../util/io/compress/CompressNodeWriter.java | 9 +- .../hdt/util/io/compress/CompressUtil.java | 3 +- .../hdt/util/listener/PrefixListener.java | 78 +++- .../rdfhdt/hdt/util/string/ByteString.java | 66 +++- .../rdfhdt/hdt/util/string/CompactString.java | 11 +- .../hdt/util/string/ReplazableString.java | 18 +- .../hdt/utils/DebugOrderNodeIterator.java | 3 +- .../org/rdfhdt/hdt/hdt/HDTManagerTest.java | 179 +++++++-- .../rdfhdt/hdt/hdtCat/HdtCatRandomTest.java | 56 ++- .../iterator/utils/CombinedIteratorTest.java | 26 ++ ...exNodeDeltaMergeExceptionIteratorTest.java | 46 +-- .../util/LargeFakeDataSetStreamSupplier.java | 31 +- .../LargeFakeDataSetStreamSupplierTest.java | 148 ++++++-- 
.../rdfhdt/hdt/util/UnicodeEscapeTest.java | 2 +- .../hdt/util/io/compress/CompressTest.java | 1 + .../util/string/AssertionCharSequence.java | 24 +- .../hdt/util/string/ByteStringTest.java | 47 +-- hdt-java-package/bin/hdtVerify.bat | 5 + 60 files changed, 1464 insertions(+), 855 deletions(-) delete mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/IteratorPlusElementComparator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/CombinedIterator.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/CombinedIteratorTest.java create mode 100644 hdt-java-package/bin/hdtVerify.bat diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java index 3a41dcc3..b4119996 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java @@ -6,7 +6,6 @@ import java.lang.reflect.Field; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TreeMap; @@ -28,13 +27,13 @@ public class HDTOptionsKeys { * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort all the file before going to the next step, slower * but decrease the RAM usage. default config. */ - @Value(value = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort all the file before going to the next step, slower but decrease the RAM usage. default config") + @Value(key = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort all the file before going to the next step, slower but decrease the RAM usage. default config") public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE = "compressionComplete"; /** * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort while reading all the file before going to the next * step, faster but increase the RAM usage. 
*/ - @Value(value = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort while reading all the file before going to the next step, faster but increase the RAM usage.") + @Value(key = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort while reading all the file before going to the next step, faster but increase the RAM usage.") public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial"; /** @@ -98,22 +97,22 @@ public class HDTOptionsKeys { /** * Value for {@link #LOADER_TYPE_KEY}, read using disk generation, reduce the RAM usage and increase disk usage */ - @Value(value = LOADER_TYPE_KEY, desc = "Using genDisk") + @Value(key = LOADER_TYPE_KEY, desc = "Using genDisk") public static final String LOADER_TYPE_VALUE_DISK = "disk"; /** * Value for {@link #LOADER_TYPE_KEY}, read using HDTCat generation, merge using HDTCat HDT, reduce the RAM usage */ - @Value(value = LOADER_TYPE_KEY, desc = "Using HDTCat") + @Value(key = LOADER_TYPE_KEY, desc = "Using HDTCat") public static final String LOADER_TYPE_VALUE_CAT = "cat"; /** * Value for {@link #LOADER_TYPE_KEY}, read twice the RDF file, reduce the RAM usage */ - @Value(value = LOADER_TYPE_KEY, desc = "Using two pass algorithm") + @Value(key = LOADER_TYPE_KEY, desc = "Using two pass algorithm") public static final String LOADER_TYPE_VALUE_TWO_PASS = "two-pass"; /** * Value for {@link #LOADER_TYPE_KEY}, read only once the RDF file, default value */ - @Value(value = LOADER_TYPE_KEY, desc = "Using one pass algorithm") + @Value(key = LOADER_TYPE_KEY, desc = "Using one pass algorithm") public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass"; /** @@ -150,12 +149,12 @@ public class HDTOptionsKeys { /** * Value for {@link #HDT_SUPPLIER_KEY}, use HDTGenDisk to create the HDT */ - @Value(value = HDT_SUPPLIER_KEY, desc = "using genDisk") + @Value(key = HDT_SUPPLIER_KEY, desc = "using genDisk") public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_DISK = "disk"; /** * Value for {@link #HDT_SUPPLIER_KEY}, use the default memory implementation to create the HDT */ - @Value(value = HDT_SUPPLIER_KEY, desc = "using gen in memory") + @Value(key = HDT_SUPPLIER_KEY, desc = "using gen in memory") public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_MEMORY = "memory"; /** * Key for the rdf flux stop type, default to the maximum memory allocated @@ -217,12 +216,12 @@ public class HDTOptionsKeys { /** * load the HDT file into memory */ - @Value(value = LOAD_HDT_TYPE_KEY, desc = "load the HDTs in memory") + @Value(key = LOAD_HDT_TYPE_KEY, desc = "load the HDTs in memory") public static final String LOAD_HDT_TYPE_VALUE_LOAD = "load"; /** * map the HDT file, default value */ - @Value(value = LOAD_HDT_TYPE_KEY, desc = "map the HDTs") + @Value(key = LOAD_HDT_TYPE_KEY, desc = "map the HDTs") public static final String LOAD_HDT_TYPE_VALUE_MAP = "map"; /** @@ -233,17 +232,17 @@ public class HDTOptionsKeys { /** * use Hash map to create the HDT */ - @Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary") + @Value(key = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary") public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH = "hash"; /** * use Hash map to create the HDT and store the multisection dictionary, mandatory to create MSC */ - @Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary with literal count") + @Value(key = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary with literal count") public static final String TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH = "multHash"; /** * use Hash map with Prefix AND 
Suffix front-coded (PSFC), mandatory to create PSFC dictionary
 */
- @Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "Prefix AND Suffix front-coded (PSFC) hash dictionary")
+ @Value(key = TEMP_DICTIONARY_IMPL_KEY, desc = "Prefix AND Suffix front-coded (PSFC) hash dictionary")
 public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH_PSFC = "hashPsfc";
 /**
@@ -254,22 +253,22 @@
 /**
 * 4 Section dictionary
 */
- @Value(value = DICTIONARY_TYPE_KEY, desc = "Four section dictionary")
+ @Value(key = DICTIONARY_TYPE_KEY, desc = "Four section dictionary")
 public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION;
 /**
 * Prefix AND Suffix front-coded (PSFC) 4 Section dictionary
 */
- @Value(value = DICTIONARY_TYPE_KEY, desc = "Prefix AND Suffix front-coded (PSFC) four section dictionary")
+ @Value(key = DICTIONARY_TYPE_KEY, desc = "Prefix AND Suffix front-coded (PSFC) four section dictionary")
 public static final String DICTIONARY_TYPE_VALUE_FOUR_PSFC_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION;
 /**
 * big 4 Section dictionary
 */
- @Value(value = DICTIONARY_TYPE_KEY, desc = "Four section dictionary big")
+ @Value(key = DICTIONARY_TYPE_KEY, desc = "Four section dictionary big")
 public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG = "dictionaryFourBig";
 /**
 * multi section dictionary
 */
- @Value(value = DICTIONARY_TYPE_KEY, desc = "Multi section dictionary")
+ @Value(key = DICTIONARY_TYPE_KEY, desc = "Multi section dictionary")
 public static final String DICTIONARY_TYPE_VALUE_MULTI_OBJECTS = "dictionaryMultiObj";
 // use tree-map to have a better order
@@ -287,7 +286,7 @@ public class HDTOptionsKeys {
 Value value = f.getAnnotation(Value.class);
 if (value != null) {
 String valueValue = (String) f.get(null);
- Option opt = OPTION_MAP.get(value.value());
+ Option opt = OPTION_MAP.get(value.key());
 if (opt != null) {
 opt.values.add(new OptionValue(valueValue, value));
 }
@@ -310,7 +309,7 @@ public static class OptionValue {
 private final String value;
 private final Value valueInfo;
- public OptionValue(String value, Value valueInfo) {
+ private OptionValue(String value, Value valueInfo) {
 this.value = value;
 this.valueInfo = valueInfo;
 }
@@ -329,7 +328,7 @@ public static class Option {
 private final Key keyInfo;
 private final List<OptionValue> values = new ArrayList<>();
- public Option(String key, Key keyInfo) {
+ private Option(String key, Key keyInfo) {
 this.key = key;
 this.keyInfo = keyInfo;
 }
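After the value() to key() rename, a declaration site reads as below. A hypothetical key, not part of the patch; note that the registry scan only inspects fields declared in HDTOptionsKeys itself, so a constant declared elsewhere would not be picked up:

    import org.rdfhdt.hdt.options.Key;
    import org.rdfhdt.hdt.options.Value;

    public class ExampleKeys {
        /**
         * Key for a hypothetical cache strategy, default to "none"
         */
        @Key(type = Key.Type.ENUM, desc = "Cache strategy")
        public static final String CACHE_STRATEGY_KEY = "cache.strategy";

        /**
         * Value for {@link #CACHE_STRATEGY_KEY}, disable caching
         */
        @Value(key = CACHE_STRATEGY_KEY, desc = "no caching")
        public static final String CACHE_STRATEGY_VALUE_NONE = "none";
    }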
diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java
index 9026c7f7..03d73d38 100644
--- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java
+++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java
@@ -3,23 +3,37 @@
 import java.lang.annotation.Retention;
 import java.lang.annotation.RetentionPolicy;
+/**
+ * define a key in the HDTOptionsKey class
+ *
+ * @author Antoine Willerval
+ */
 @Retention(RetentionPolicy.RUNTIME)
 public @interface Key {
- enum Type {
- STRING("String"), PATH("Path"), NUMBER("Number"), DOUBLE("Double"), BOOLEAN("Boolean"), ENUM("Enum");
+ /**
+ * Type enum for a key
+ */
+ enum Type {
+ STRING("String"), PATH("Path"), NUMBER("Number"), DOUBLE("Double"), BOOLEAN("Boolean"), ENUM("Enum");
- private final String title;
+ private final String title;
- Type(String title) {
- this.title = title;
- }
+ Type(String title) {
+ this.title = title;
+ }
- public String getTitle() {
- return title;
- }
- }
+ public String getTitle() {
+ return title;
+ }
+ }
- String desc() default "";
+ /**
+ * @return description of the key
+ */
+ String desc() default "";
- Type type() default Type.STRING;
+ /**
+ * @return type of the key
+ */
+ Type type() default Type.STRING;
 }
diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java
index 6294587b..3c339268 100644
--- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java
+++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java
@@ -3,8 +3,20 @@
 import java.lang.annotation.Retention;
 import java.lang.annotation.RetentionPolicy;
+/**
+ * Describe the value of a {@link Key} of type {@link Key.Type#ENUM}
+ *
+ * @author Antoine Willerval
+ */
 @Retention(RetentionPolicy.RUNTIME)
 public @interface Value {
- String value();
- String desc() default "";
+ /**
+ * @return the key
+ */
+ String key();
+
+ /**
+ * @return description of the value
+ */
+ String desc() default "";
 }
diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java
index 15900c1c..a7cbb17c 100644
--- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java
+++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java
@@ -1,79 +1,85 @@
 package org.rdfhdt.hdt.tools;
-import java.util.Comparator;
-import java.util.Iterator;
-
 import org.rdfhdt.hdt.hdt.HDT;
 import org.rdfhdt.hdt.hdt.HDTManager;
+import org.rdfhdt.hdt.util.string.ByteString;
 import org.rdfhdt.hdt.util.string.CharSequenceComparator;
 import org.rdfhdt.hdt.util.string.CompactString;
+import org.rdfhdt.hdt.util.string.ReplazableString;
+
+import java.util.Comparator;
+import java.util.Iterator;
 public class HDTVerify {
- private static final Comparator<CharSequence> comparator = CharSequenceComparator.getInstance();
-
- private HDTVerify() {}
+ private HDTVerify() {
+ }
+
+ private static void print(byte[] arr) {
+ for (byte b : arr) {
+ System.out.printf("%02X ", b);
+ }
+ System.out.println();
+ }
+
+ private static void print(CharSequence seq) {
+ if (seq instanceof CompactString) {
+ CompactString cs1 = (CompactString) seq;
+ print(cs1.getData());
+ }
- private static void print(byte[] arr) {
- for (byte b : arr) {
- System.out.printf("%02X ", b);
- }
- System.out.println();
- }
+ if (seq instanceof String) {
+ String rs1 = (String) seq;
+ print(rs1.getBytes());
+ }
+ }
- private static void print(CharSequence seq) {
- if(seq instanceof CompactString) {
- CompactString cs1 = (CompactString) seq;
- print(cs1.getData());
- }
+ public static void checkDictionarySectionOrder(Iterator<? extends CharSequence> it) {
+ ReplazableString prev = new ReplazableString();
+ String lastStr = "";
+ while (it.hasNext()) {
+ ByteString charSeq = ByteString.of(it.next());
+ String str = charSeq.toString();
- if(seq instanceof String) {
- String rs1 = (String) seq;
- print(rs1.getBytes());
- }
- }
+ int cmp = prev.compareTo(charSeq);
- public static void checkDictionarySectionOrder(Iterator<? extends CharSequence> it) {
- CharSequence lastCharseq = null;
- String lastStr =null;
- int cmp=0, cmp2=0;
- while (it.hasNext()) {
- CharSequence charSeq = it.next();
- String str = charSeq.toString();
+ if (cmp >= 0) {
+ System.out.println("ERRA: " + prev + " / " + charSeq);
+ }
- if(lastCharseq!=null && ((cmp=comparator.compare(lastCharseq, charSeq))>=0 )) {
- System.out.println("ERRA: "+lastCharseq+" / "+charSeq);
- }
+ int cmp2 = lastStr.compareTo(str);
- if(lastStr!=null && ((cmp2=lastStr.compareTo(str))>=0)) {
- System.out.println("ERRB: "+lastStr+" / "+str);
- }
+ if (cmp2 >= 0) {
+ System.out.println("ERRB: " + lastStr + " / " + str);
+ }
- if(Math.signum(cmp)!=Math.signum(cmp2)) {
- System.out.println("Not equal: "+cmp+" / "+cmp2);
- print(lastCharseq); print(charSeq);
- print(lastStr); print(str);
- }
+ if (Math.signum(cmp) != Math.signum(cmp2)) {
+ System.out.println("Not equal: " + cmp + " / " + cmp2);
+ print(prev);
+ print(charSeq);
+ print(lastStr);
+ print(str);
+ }
- lastCharseq = charSeq;
- lastStr = str;
- }
- }
+ prev.replace(charSeq);
+ lastStr = str;
+ }
+ }
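The Math.signum check above exists because the two orders can genuinely disagree: String.compareTo works on UTF-16 code units, while the ByteString comparison works on the UTF-8 byte sequences. The pair from ByteStringTest above shows the divergence (a sketch, not part of the diff):

    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;

    public class OrderMismatch {
        public static void main(String[] args) {
            String ss1 = "\uD85B\uDCE3"; // U+26CE3, UTF-8 bytes F0 A6 B3 A3
            String ss2 = "\uF4D1";       // U+F4D1, UTF-8 bytes EF 93 91
            // UTF-16 code unit order: 0xD85B < 0xF4D1, so ss1 sorts first
            System.out.println(ss1.compareTo(ss2) < 0); // true
            // unsigned UTF-8 byte order: 0xF0 > 0xEF, so ss1 sorts last
            byte[] b1 = ss1.getBytes(StandardCharsets.UTF_8);
            byte[] b2 = ss2.getBytes(StandardCharsets.UTF_8);
            System.out.println(Arrays.compareUnsigned(b1, b2) > 0); // true
        }
    }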
- public static void main(String[] args) throws Throwable {
- if(args.length<1) {
- System.out.println("hdtVerify <file.hdt>");
- System.exit(-1);
- }
- try (HDT hdt = HDTManager.mapHDT(args[0], null)) {
- System.out.println("Checking subject entries");
- checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries());
- System.out.println("Checking predicate entries");
- checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries());
- System.out.println("Checking object entries");
- checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries());
- System.out.println("Checking shared entries");
- checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries());
- }
- }
+ public static void main(String[] args) throws Throwable {
+ if (args.length < 1) {
+ System.out.println("hdtVerify <file.hdt>");
+ System.exit(-1);
+ }
+ try (HDT hdt = HDTManager.mapHDT(args[0], null)) {
+ System.out.println("Checking subject entries");
+ checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries());
+ System.out.println("Checking predicate entries");
+ checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries());
+ System.out.println("Checking object entries");
+ checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries());
+ System.out.println("Checking shared entries");
+ checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries());
+ }
+ }
 }
diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
index 6aa335cb..33f4e3e8 100644
--- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
+++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
@@ -147,9 +147,7 @@ public void execute() throws ParserException, IOException {
 } else {
 baseURI = Path.of(rdfInput).toUri().toString();
 }
- if (!quiet) {
- System.out.println("base uri not specified, using '" + baseURI + "'");
- }
+ warn("base uri not specified, using '" + baseURI + "'");
 }
 RDFNotation notation = null;
@@ -157,7 +155,7 @@ public void execute() throws ParserException, IOException {
 try {
 notation = RDFNotation.parse(rdfType);
 } catch (IllegalArgumentException e) {
- System.out.println("Notation " + rdfType + " not recognised.");
+ warn("Notation " + rdfType + " not recognised.");
 }
 }
@@ -165,11 +163,13 @@ public void execute() throws ParserException, IOException {
 try {
 notation = RDFNotation.guess(rdfInput);
 } catch (IllegalArgumentException e) {
- System.out.println("Could not guess notation for " + rdfInput + " Trying NTriples");
+ warn("Could not guess notation for " + rdfInput + " Trying NTriples");
 notation = RDFNotation.NTRIPLES;
 }
 }
+
+ log("Converting " + rdfInput + " to " + hdtOutput + " as " + notation.name());
+
 if (ntSimpleLoading) {
 spec.set("parser.ntSimpleParser", "true");
 }
@@ -185,16 +185,12 @@ public void execute() throws ParserException, IOException {
 long maxTreeCatChunkSize = getMaxTreeCatChunkSize();
- if (!quiet) { - System.out.println("Compute HDT with HDTCatTree using chunk of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true)); - } + log("Compute HDT with HDTCatTree using chunk of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true)); if (disk) { if (diskLocation != null) { spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, diskLocation); - if (!quiet) { - System.out.println("Using temp directory " + diskLocation); - } + log("Using temp directory " + diskLocation); } MultiThreadListenerConsole listenerConsole = !quiet ? new MultiThreadListenerConsole(color) : null; hdt = HDTManager.catTree( @@ -222,14 +218,12 @@ public void execute() throws ParserException, IOException { } } else if (disk) { if (!quiet) { - System.out.println("Generating using generateHDTDisk"); + log("Generating using generateHDTDisk"); } spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, hdtOutput); if (diskLocation != null) { spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, diskLocation); - if (!quiet) { - System.out.println("Using temp directory " + diskLocation); - } + log("Using temp directory " + diskLocation); } MultiThreadListenerConsole listenerConsole = !quiet ? new MultiThreadListenerConsole(color) : null; hdt = HDTManager.generateHDTDisk(rdfInput, baseURI, notation, CompressionType.guess(rdfInput), spec, listenerConsole); @@ -242,30 +236,31 @@ public void execute() throws ParserException, IOException { : null; hdt = HDTManager.generateHDT(rdfInput, baseURI, notation, spec, listenerConsole); } - System.out.println("File converted in: "+sw.stopAndShow()); + + logValue("File converted in ..... ", sw.stopAndShow(), true); try { // Show Basic stats if(!quiet){ - System.out.println("Total Triples: "+hdt.getTriples().getNumberOfElements()); - System.out.println("Different subjects: "+hdt.getDictionary().getNsubjects()); - System.out.println("Different predicates: "+hdt.getDictionary().getNpredicates()); - System.out.println("Different objects: "+hdt.getDictionary().getNobjects()); - System.out.println("Common Subject/Object:"+hdt.getDictionary().getNshared()); + logValue("Total Triples ......... ", "" + hdt.getTriples().getNumberOfElements()); + logValue("Different subjects .... ", "" + hdt.getDictionary().getNsubjects()); + logValue("Different predicates .. ", "" + hdt.getDictionary().getNpredicates()); + logValue("Different objects ..... ", "" + hdt.getDictionary().getNobjects()); + logValue("Common Subject/Object . ", "" + hdt.getDictionary().getNshared()); } // Dump to HDT file if (!disk && !catTree) { sw = new StopWatch(); hdt.saveToHDT(hdtOutput, this); - System.out.println("HDT saved to file in: "+sw.stopAndShow()); + logValue("HDT saved to file in .. 
", sw.stopAndShow()); } // Generate index and dump it to .hdt.index file sw.reset(); if(generateIndex) { hdt = HDTManager.indexedHDT(hdt,this); - System.out.println("Index generated and saved in: "+sw.stopAndShow()); + logValue("Index generated and saved in ", sw.stopAndShow()); } } finally { if(hdt!=null) hdt.close(); @@ -285,6 +280,29 @@ public void notifyProgress(float level, String message) { } } + private String prefix(String pref, int r, int g, int b) { + return colorReset() + "[" + color(r, g, b) + pref + colorReset() + "]"; + } + + private void log(String msg) { + if (!quiet) { + System.out.println(prefix("INFO", 3, 1, 5) + " " + colorReset() + msg); + } + } + private void logValue(String msg, String value, boolean ignoreQuiet) { + if (!quiet || ignoreQuiet) { + System.out.println(color(3, 1, 5) + msg + colorReset() + value); + } + } + private void logValue(String msg, String value) { + logValue(msg, value, false); + } + private void warn(String msg) { + if (!quiet) { + System.out.println(prefix("WARN", 5, 5, 0) + " " + colorReset() + msg); + } + } + private String color(int r, int g, int b) { if (!color) { return ""; @@ -314,10 +332,10 @@ public static void main(String[] args) throws Throwable { System.out.println(rdf2hdt.color(3, 1, 5) + "Type: " + rdf2hdt.colorReset() + opt.getKeyInfo().type().getTitle()); switch (opt.getKeyInfo().type()) { case BOOLEAN: - System.out.println(rdf2hdt.color(3, 1, 5) + "Possible value: " + rdf2hdt.colorReset() + "true|false"); + System.out.println(rdf2hdt.color(3, 1, 5) + "Possible values: " + rdf2hdt.colorReset() + "true|false"); break; case ENUM: - System.out.println(rdf2hdt.color(3, 1, 5) + "Possible value:"); + System.out.println(rdf2hdt.color(3, 1, 5) + "Possible value(s):"); int max = opt.getValues().stream().mapToInt(vle -> vle.getValue().length()).max().orElse(0); for (HDTOptionsKeys.OptionValue vle : opt.getValues()) { System.out.print(rdf2hdt.color(3, 3, 3) + "- " + rdf2hdt.colorReset() + vle.getValue()); @@ -354,8 +372,6 @@ public static void main(String[] args) throws Throwable { System.exit(1); } - System.out.println("Converting "+rdf2hdt.rdfInput+" to "+rdf2hdt.hdtOutput+" as "+rdf2hdt.rdfType); - rdf2hdt.execute(); } } diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java index dca6d866..43424010 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java @@ -2,6 +2,7 @@ import java.util.Map; import java.util.TreeMap; +import java.util.stream.Collectors; import org.rdfhdt.hdt.listener.MultiThreadListener; @@ -58,11 +59,11 @@ public String colorReset() { } public String colorThread() { - return color(5, 1, 1); + return color(3, 1, 5); } public String colorPercentage() { - return color(1, 1, 5); + return color(5, 1, 0); } @Override @@ -85,17 +86,18 @@ public synchronized void unregisterThread(String threadName) { return; } threadMessages.remove(threadName); + threadMessages.put("debug", "size: " + threadMessages.size()); render(); } @Override public synchronized void notifyProgress(String thread, float level, String message) { - String msg = colorPercentage() + "[" + level + "] " + colorReset() + message; + String msg = colorReset() + "[" + colorPercentage() + String.format(level >= 100 ? 
"%-5.1f" : "%-5.2f", level) + colorReset() + "] " + message; if (threadMessages != null) { threadMessages.put(thread, msg); render(); } else { - System.out.println(colorThread() + "[" + thread + "]" + colorReset() + msg); + System.out.println(colorReset() + "[" + colorThread() + thread + colorReset() + "]" + msg); } } @@ -110,10 +112,14 @@ private void render() { if (previous != 0) { message.append(goBackNLine(previous)); } + + int maxThreadNameSize = threadMessages.keySet().stream().mapToInt(String::length).max().orElse(0) + 1; + // write each thread logs threadMessages.forEach((thread, msg) -> message .append(ERASE_LINE) - .append(colorThread()).append("[").append(thread).append("]") + .append(colorReset()).append("[").append(colorThread()).append(thread).append(colorReset()).append("]") + .append(" ").append(".".repeat(maxThreadNameSize - thread.length())).append(" ") .append(msg).append("\n")); // remove previous printing int toRemove = previous - lines; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryCat.java index 87be2f2b..047119fc 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryCat.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryCat.java @@ -3,14 +3,15 @@ import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMapping; import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMappingBack; import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.util.string.ByteString; import java.io.Closeable; import java.io.IOException; -import java.util.HashMap; +import java.util.Map; public interface DictionaryCat extends Closeable { void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener listener) throws IOException; CatMappingBack getMappingS(); long getNumShared(); - HashMap getAllMappings(); + Map getAllMappings(); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryDiff.java index c6a7710c..98ad0c9d 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryDiff.java @@ -3,6 +3,7 @@ import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMapping; import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.util.string.ByteString; import java.io.Closeable; import java.io.IOException; @@ -32,5 +33,5 @@ public interface DictionaryDiff extends Closeable { /** * @return the cat mapping for each section */ - Map getAllMappings(); + Map getAllMappings(); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionarySection.java index 51125b28..a49a5b27 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/TempDictionarySection.java @@ -28,6 +28,7 @@ import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.util.string.ByteString; import java.util.Iterator; import java.util.Map; @@ -76,7 +77,7 @@ public interface TempDictionarySection extends DictionarySection { /** * @return the literal counts for MultipleSectionDictionary */ - default Map getLiteralsCounts() { + default Map getLiteralsCounts() { throw new 
NotImplementedException("getLiteralsCounts()"); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java index 32c12bca..0990d6d9 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java @@ -30,6 +30,7 @@ import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.PrefixListener; +import org.rdfhdt.hdt.util.string.ByteString; import java.io.*; import java.nio.file.Files; @@ -39,7 +40,7 @@ public class FourSectionDictionaryCat implements DictionaryCat { - private final HashMap allMappings = new HashMap<>(); + private final Map allMappings = new HashMap<>(); private final String location; private long numShared; @@ -51,14 +52,14 @@ public FourSectionDictionaryCat(String location) { } public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener listener) throws IOException { - allMappings.put("P1",new CatMapping(location,"P1",dictionary1.getPredicates().getNumberOfElements())); - allMappings.put("P2",new CatMapping(location,"P2",dictionary2.getPredicates().getNumberOfElements())); - allMappings.put("S1",new CatMapping(location,"S1",dictionary1.getSubjects().getNumberOfElements())); - allMappings.put("S2",new CatMapping(location,"S2",dictionary2.getSubjects().getNumberOfElements())); - allMappings.put("O1",new CatMapping(location, "O1",dictionary1.getObjects().getNumberOfElements())); - allMappings.put("O2",new CatMapping(location, "O2",dictionary2.getObjects().getNumberOfElements())); - allMappings.put("SH1",new CatMapping(location,"SH1",dictionary1.getShared().getNumberOfElements())); - allMappings.put("SH2",new CatMapping(location,"SH2",dictionary2.getShared().getNumberOfElements())); + allMappings.put(SectionUtil.P1,new CatMapping(location, SectionUtil.P1, dictionary1.getPredicates().getNumberOfElements())); + allMappings.put(SectionUtil.P2,new CatMapping(location, SectionUtil.P2, dictionary2.getPredicates().getNumberOfElements())); + allMappings.put(SectionUtil.S1,new CatMapping(location, SectionUtil.S1, dictionary1.getSubjects().getNumberOfElements())); + allMappings.put(SectionUtil.S2,new CatMapping(location, SectionUtil.S2, dictionary2.getSubjects().getNumberOfElements())); + allMappings.put(SectionUtil.O1,new CatMapping(location, SectionUtil.O1, dictionary1.getObjects().getNumberOfElements())); + allMappings.put(SectionUtil.O2,new CatMapping(location, SectionUtil.O2, dictionary2.getObjects().getNumberOfElements())); + allMappings.put(SectionUtil.SH1,new CatMapping(location, SectionUtil.SH1, dictionary1.getShared().getNumberOfElements())); + allMappings.put(SectionUtil.SH2,new CatMapping(location, SectionUtil.SH2, dictionary2.getShared().getNumberOfElements())); // System.out.println("PREDICATES-------------------"); ProgressListener iListener; @@ -70,7 +71,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener int numCommonPredicates = 0; - CatIntersection commonP1P2 = new CatIntersection(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),"P1"),new CatWrapper(dictionary2.getPredicates().getSortedEntries(),"P2")); + CatIntersection commonP1P2 = new CatIntersection(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),SectionUtil.P1),new 
CatWrapper(dictionary2.getPredicates().getSortedEntries(),SectionUtil.P2)); while (commonP1P2.hasNext()){ commonP1P2.next(); numCommonPredicates++; @@ -79,8 +80,8 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener long numPredicates = dictionary1.getPredicates().getNumberOfElements()+dictionary2.getPredicates().getNumberOfElements()-numCommonPredicates; ArrayList> addPredicatesList = new ArrayList<>(); - addPredicatesList.add(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),"P1")); - addPredicatesList.add(new CatWrapper(dictionary2.getPredicates().getSortedEntries(),"P2")); + addPredicatesList.add(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),SectionUtil.P1)); + addPredicatesList.add(new CatWrapper(dictionary2.getPredicates().getSortedEntries(),SectionUtil.P2)); CatUnion itAddPredicates = new CatUnion(addPredicatesList); SectionUtil.createSection(location,numPredicates, 4,itAddPredicates, new CatUnion(new ArrayList<>()),allMappings,0, iListener); // System.out.println("SUBJECTS-------------------"); @@ -90,10 +91,10 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } ArrayList> skipSubjectList = new ArrayList<>(); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"),new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"),new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2"))); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"),new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"),new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1),new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1),new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2),new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2),new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1))); CatUnion skipSubject = new CatUnion(skipSubjectList); int numSkipSubjects = 0; while (skipSubject.hasNext()){ @@ -101,7 +102,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener numSkipSubjects++; } int numCommonSubjects = 0; - CatIntersection commonS1S2 = new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"),new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2")); + CatIntersection commonS1S2 = new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1),new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2)); while (commonS1S2.hasNext()){ commonS1S2.next(); numCommonSubjects++; @@ -110,15 +111,15 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener skipSubjectList 
= new ArrayList<>(); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"),new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"),new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2"))); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"),new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"),new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1),new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1),new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2),new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2),new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1))); skipSubject = new CatUnion(skipSubjectList); ArrayList> addSubjectsList = new ArrayList<>(); - addSubjectsList.add(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1")); - addSubjectsList.add(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2")); + addSubjectsList.add(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1)); + addSubjectsList.add(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2)); CatUnion itAddSubjects = new CatUnion(addSubjectsList); SectionUtil.createSection(location,numSubjects, 2,itAddSubjects,skipSubject ,allMappings,0, iListener); @@ -129,10 +130,10 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener iListener.notifyProgress(0, "start"); } ArrayList> skipObjectsList = new ArrayList<>(); - skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"),new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); - skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"),new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"))); - skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2"),new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); - skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2"),new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"))); + skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1),new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2))); + skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1),new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2))); + skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2),new 
CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1))); + skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2),new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1))); CatUnion skipObject = new CatUnion(skipObjectsList); int numSkipObjects = 0; @@ -142,7 +143,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } int numCommonObjects = 0; - CatIntersection commonO1O2 = new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"),new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2")); + CatIntersection commonO1O2 = new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1),new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2)); while (commonO1O2.hasNext()){ commonO1O2.next(); numCommonObjects++; @@ -150,17 +151,17 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener skipObjectsList = new ArrayList<>(); - skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"),new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); - skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"),new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"))); - skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2"),new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); - skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2"),new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"))); + skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1),new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2))); + skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1),new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2))); + skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2),new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1))); + skipObjectsList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2),new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1))); skipObject = new CatUnion(skipObjectsList); long numObject = dictionary1.getObjects().getNumberOfElements()+dictionary2.getObjects().getNumberOfElements()-numCommonObjects-numSkipObjects; ArrayList> addObjectsList = new ArrayList<>(); - addObjectsList.add(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1")); - addObjectsList.add(new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2")); + addObjectsList.add(new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1)); + addObjectsList.add(new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2)); CatUnion itAddObjects = new CatUnion(addObjectsList); SectionUtil.createSection(location,numObject, 3,itAddObjects,skipObject ,allMappings,0, iListener); @@ -170,20 +171,20 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener if (iListener != null) { iListener.notifyProgress(0, "start"); } - CatIntersection i2 = new CatIntersection(new 
CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"), new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2")); + CatIntersection i2 = new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1), new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2)); int numCommonS1O2=0; while (i2.hasNext()){ i2.next(); numCommonS1O2++; } - i2 = new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"), new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2")); + i2 = new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1), new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2)); int numCommonO1S2=0; while (i2.hasNext()){ i2.next(); numCommonO1S2++; } - i2 = new CatIntersection(new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"),new CatWrapper( dictionary2.getShared().getSortedEntries(),"SH2")); + i2 = new CatIntersection(new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1),new CatWrapper( dictionary2.getShared().getSortedEntries(),SectionUtil.SH2)); int numCommonSh1Sh2=0; while (i2.hasNext()){ i2.next(); @@ -192,15 +193,15 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener numShared = dictionary1.getShared().getNumberOfElements()+dictionary2.getShared().getNumberOfElements()-numCommonSh1Sh2+numCommonS1O2+numCommonO1S2; ArrayList> addSharedList = new ArrayList<>(); - addSharedList.add(new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1")); - addSharedList.add(new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2")); + addSharedList.add(new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1)); + addSharedList.add(new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2)); - addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"),new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2"))); - addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"),new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); - addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"),new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"))); - addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"),new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); - addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"),new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); - addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),"O2"),new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); + addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1),new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2))); + addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1),new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2))); + addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2),new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1))); + addSharedList.add(new CatIntersection(new 
CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2),new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1))); + addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),SectionUtil.O1),new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2))); + addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getObjects().getSortedEntries(),SectionUtil.O2),new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1))); CatUnion itAddShared = new CatUnion(addSharedList); SectionUtil.createSection(location,numShared, 1,itAddShared,new CatUnion(new ArrayList<>()) ,allMappings,0, iListener); @@ -228,27 +229,27 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener //calculate the inverse mapping for the subjects, i.e. from the new dictionary subject section to the old ones mappingS = new CatMappingBack(location,numSubjects+numShared); - for (int i=0; i getAllMappings() { + @Override + public Map getAllMappings() { return allMappings; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java index 23f6fe0e..9a0c7a9c 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java @@ -14,6 +14,7 @@ import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.string.ByteString; import java.io.FileOutputStream; import java.io.IOException; @@ -31,7 +32,7 @@ public class FourSectionDictionaryDiff implements DictionaryDiff { private final String location; - private final Map allMappings = new HashMap<>(); + private final Map allMappings = new HashMap<>(); private CatMapping mappingBack; public long numShared; @@ -50,17 +51,17 @@ public void close() throws IOException { } @Override public void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) throws IOException { - allMappings.put("predicate", new CatMapping(location, "predicate", dictionary.getPredicates().getNumberOfElements())); - allMappings.put("subject", new CatMapping(location, "subject", dictionary.getSubjects().getNumberOfElements())); - allMappings.put("object", new CatMapping(location, "object", dictionary.getObjects().getNumberOfElements())); - allMappings.put("shared", new CatMapping(location, "shared", dictionary.getShared().getNumberOfElements())); + allMappings.put(SectionUtil.SECTION_PREDICATE, new CatMapping(location, SectionUtil.SECTION_PREDICATE, dictionary.getPredicates().getNumberOfElements())); + allMappings.put(SectionUtil.SECTION_SUBJECT, new CatMapping(location, SectionUtil.SECTION_SUBJECT, dictionary.getSubjects().getNumberOfElements())); + allMappings.put(SectionUtil.SECTION_OBJECT, new CatMapping(location, SectionUtil.SECTION_OBJECT, dictionary.getObjects().getNumberOfElements())); + allMappings.put(SectionUtil.SECTION_SHARED, new CatMapping(location, SectionUtil.SECTION_SHARED, dictionary.getShared().getNumberOfElements())); // allMappings.put("shared_o",new CatMapping(location,"shared_o",dictionary.getShared().getNumberOfElements())); // Predicates Bitmap predicatesBitMap = bitmaps.get("P"); Iterator predicates = dictionary.getPredicates().getSortedEntries(); - DiffWrapper itSkipPreds = new 
DiffWrapper(predicates, predicatesBitMap, "predicate"); + DiffWrapper itSkipPreds = new DiffWrapper(predicates, predicatesBitMap, SectionUtil.SECTION_PREDICATE); ArrayList> listSkipPred = new ArrayList<>(); listSkipPred.add(itSkipPreds); @@ -74,7 +75,7 @@ public void diff(Dictionary dictionary, Map bitm Bitmap subjectsBitMap = bitmaps.get("S"); Iterator subjects = dictionary.getSubjects().getSortedEntries(); - DiffWrapper itSkipSubs = new DiffWrapper(subjects, subjectsBitMap, "subject"); + DiffWrapper itSkipSubs = new DiffWrapper(subjects, subjectsBitMap, SectionUtil.SECTION_SUBJECT); List> listSkipSubj = new ArrayList<>(); listSkipSubj.add(itSkipSubs); @@ -93,7 +94,7 @@ public void diff(Dictionary dictionary, Map bitm Bitmap objectsBitMap = bitmaps.get("O"); Iterator objects = dictionary.getObjects().getSortedEntries(); - DiffWrapper itSkipObjs = new DiffWrapper(objects, objectsBitMap, "object"); + DiffWrapper itSkipObjs = new DiffWrapper(objects, objectsBitMap, SectionUtil.SECTION_OBJECT); ArrayList> listSkipObjs = new ArrayList<>(); listSkipObjs.add(itSkipObjs); @@ -113,11 +114,11 @@ public void diff(Dictionary dictionary, Map bitm Iterator shared = dictionary.getShared().getSortedEntries(); - DiffWrapper sharedSubj = new DiffWrapper(shared, sharedSubjBitMap, "shared"); + DiffWrapper sharedSubj = new DiffWrapper(shared, sharedSubjBitMap, SectionUtil.SECTION_SHARED); shared = dictionary.getShared().getSortedEntries(); - DiffWrapper sharedObj = new DiffWrapper(shared, sharedObjBitMap, "shared"); + DiffWrapper sharedObj = new DiffWrapper(shared, sharedObjBitMap, SectionUtil.SECTION_SHARED); ArrayList> listShared = new ArrayList<>(); listShared.add(new CatIntersection(sharedSubj, sharedObj)); @@ -129,9 +130,9 @@ public void diff(Dictionary dictionary, Map bitm } listShared = new ArrayList<>(); - sharedSubj = new DiffWrapper(dictionary.getShared().getSortedEntries(), sharedSubjBitMap, "shared"); + sharedSubj = new DiffWrapper(dictionary.getShared().getSortedEntries(), sharedSubjBitMap, SectionUtil.SECTION_SHARED); - sharedObj = new DiffWrapper(dictionary.getShared().getSortedEntries(), sharedObjBitMap, "shared"); + sharedObj = new DiffWrapper(dictionary.getShared().getSortedEntries(), sharedObjBitMap, SectionUtil.SECTION_SHARED); listShared.add(new CatIntersection(sharedSubj, sharedObj)); SectionUtil.createSection(location, numShared, 1, new CatUnion(listShared), new CatUnion(new ArrayList<>()), allMappings, 0, listener); @@ -158,24 +159,26 @@ public void diff(Dictionary dictionary, Map bitm Files.delete(Paths.get(location + "section" + j)); } } - mappingBack = new CatMapping(location, "back", numSubj + numShared); + mappingBack = new CatMapping(location, SectionUtil.BACK, numSubj + numShared); if (mappingBack.getSize() > 0) { - for (int i = 0; i < allMappings.get("shared").getSize(); i++) { - long type = allMappings.get("shared").getType(i); + CatMapping sharedMapping = allMappings.get(SectionUtil.SECTION_SHARED); + for (int i = 0; i < sharedMapping.getSize(); i++) { + long type = sharedMapping.getType(i); if (type == 1) { - mappingBack.set(allMappings.get("shared").getMapping(i) - 1, i + 1, 1); + mappingBack.set(sharedMapping.getMapping(i) - 1, i + 1, 1); } else if (type == 2) { - mappingBack.set(allMappings.get("shared").getMapping(i) + numShared - 1, i + 1, 2); + mappingBack.set(sharedMapping.getMapping(i) + numShared - 1, i + 1, 2); } } - for (int i = 0; i < allMappings.get("subject").getSize(); i++) { - long type = allMappings.get("subject").getType(i); + CatMapping subjectMapping = 
allMappings.get(SectionUtil.SECTION_SUBJECT); + for (int i = 0; i < subjectMapping.getSize(); i++) { + long type = subjectMapping.getType(i); if (type == 1) { - mappingBack.set(allMappings.get("subject").getMapping(i) - 1, (i + 1 + (int) dictionary.getNshared()), 1); + mappingBack.set(subjectMapping.getMapping(i) - 1, (i + 1 + (int) dictionary.getNshared()), 1); } else if (type == 2) { - mappingBack.set(allMappings.get("subject").getMapping(i) + numShared - 1, (i + 1 + (int) dictionary.getNshared()), 2); + mappingBack.set(subjectMapping.getMapping(i) + numShared - 1, (i + 1 + (int) dictionary.getNshared()), 2); } } } @@ -200,10 +203,10 @@ public SharedWrapper(int flag, Bitmap bitmapSub, Bitmap bitmapObj, Iterator IDs = new ArrayList<>(); - IDs.add(new CatElement.IteratorPlusPosition("shared", count + 1)); + IDs.add(new CatElement.IteratorPlusPosition(SectionUtil.SECTION_SHARED, count + 1)); next = new CatElement(element, IDs); count++; return true; @@ -230,7 +233,7 @@ public int count() { } @Override - public Map getAllMappings() { + public Map getAllMappings() { return allMappings; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java index 3e552bf0..91633e4e 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java @@ -3,8 +3,10 @@ import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySectionMap; import org.rdfhdt.hdt.dictionary.impl.section.PFCOptimizedExtractor; +import org.rdfhdt.hdt.dictionary.impl.utilCat.SectionUtil; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import java.util.AbstractMap; @@ -14,7 +16,7 @@ public class MultDictionaryPFCOptimizedExtractor implements OptimizedExtractor{ private final PFCOptimizedExtractor shared, subjects, predicates; - private final TreeMap objects; + private final TreeMap objects; private final long numshared; public MultDictionaryPFCOptimizedExtractor(MultipleSectionDictionary origDict) { @@ -23,20 +25,20 @@ public MultDictionaryPFCOptimizedExtractor(MultipleSectionDictionary origDict) { subjects = new PFCOptimizedExtractor((PFCDictionarySectionMap) origDict.subjects); predicates = new PFCOptimizedExtractor((PFCDictionarySectionMap) origDict.predicates); objects = new TreeMap<>(CharSequenceComparator.getInstance()); - for (Map.Entry entry : origDict.getAllObjects().entrySet()) { - objects.put(entry.getKey(), new PFCOptimizedExtractor((PFCDictionarySectionMap) entry.getValue())); + for (Map.Entry entry : origDict.getAllObjects().entrySet()) { + objects.put(ByteString.of(entry.getKey()), new PFCOptimizedExtractor((PFCDictionarySectionMap) entry.getValue())); } } @Override public CharSequence idToString(long id, TripleComponentRole role) { - AbstractMap.SimpleEntry section = getSection(id, role); + AbstractMap.SimpleEntry section = getSection(id, role); long localId = getLocalId(id, role); - if(section.getKey().equals(LiteralsUtils.NO_DATATYPE_STR) || section.getKey().equals("section")) + if(section.getKey().equals(LiteralsUtils.NO_DATATYPE) || section.getKey().equals(SectionUtil.SECTION)) return 
section.getValue().extract(localId); else { String label = section.getValue().extract(localId).toString(); - String dType = section.getKey(); + ByteString dType = section.getKey(); //Matcher matcher = pattern.matcher(label); if(LiteralsUtils.containsLanguage(label)){ return label; @@ -45,32 +47,32 @@ public CharSequence idToString(long id, TripleComponentRole role) { } } } - private AbstractMap.SimpleEntry getSection(long id, TripleComponentRole role) { + private AbstractMap.SimpleEntry getSection(long id, TripleComponentRole role) { switch (role) { case SUBJECT: if(id<=numshared) { - return new AbstractMap.SimpleEntry<>("section",shared); + return new AbstractMap.SimpleEntry<>(SectionUtil.SECTION,shared); } else { - return new AbstractMap.SimpleEntry<>("section",subjects); + return new AbstractMap.SimpleEntry<>(SectionUtil.SECTION,subjects); } case PREDICATE: - return new AbstractMap.SimpleEntry<>("section",predicates); + return new AbstractMap.SimpleEntry<>(SectionUtil.SECTION,predicates); case OBJECT: if(id<= numshared) { - return new AbstractMap.SimpleEntry<>("section",shared); + return new AbstractMap.SimpleEntry<>(SectionUtil.SECTION,shared); } else { - Iterator> hmIterator = objects.entrySet().iterator(); + Iterator> hmIterator = objects.entrySet().iterator(); // iterate over all subsections in the objects section PFCOptimizedExtractor desiredSection = null; - String type = ""; + ByteString type = ByteString.empty(); int count = 0; while (hmIterator.hasNext()) { - Map.Entry entry = hmIterator.next(); + Map.Entry entry = hmIterator.next(); PFCOptimizedExtractor subSection = entry.getValue(); count+= subSection.getNumStrings(); if(id <= numshared+count){ desiredSection = subSection; - type = entry.getKey().toString(); + type = entry.getKey(); break; } } @@ -92,11 +94,11 @@ private long getLocalId(long id, TripleComponentRole position) { if(id<=numshared) { return id; } else { - Iterator> hmIterator = objects.entrySet().iterator(); + Iterator> hmIterator = objects.entrySet().iterator(); // iterate over all subsections in the objects section long count = 0; while (hmIterator.hasNext()){ - Map.Entry entry = hmIterator.next(); + Map.Entry entry = hmIterator.next(); PFCOptimizedExtractor subSection = entry.getValue(); count+= subSection.getNumStrings(); if(id <= numshared + count){ diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java index 15f364e7..4b6d819d 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java @@ -5,14 +5,15 @@ import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; import org.rdfhdt.hdt.dictionary.TempDictionary; +import org.rdfhdt.hdt.dictionary.impl.utilCat.SectionUtil; import org.rdfhdt.hdt.enums.DictionarySectionRole; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; -import org.rdfhdt.hdt.util.string.CompactString; import java.util.AbstractMap; import java.util.Iterator; @@ -20,13 +21,11 @@ import java.util.TreeMap; public abstract class MultipleBaseDictionary 
implements DictionaryPrivate { - private static final CharSequence SECTION = new CompactString("section"); - protected final HDTOptions spec; protected DictionarySectionPrivate subjects; protected DictionarySectionPrivate predicates; - protected TreeMap objects; + protected TreeMap objects; protected DictionarySectionPrivate shared; public MultipleBaseDictionary(HDTOptions spec) { @@ -38,11 +37,11 @@ protected long getGlobalId(long id, DictionarySectionRole position, CharSequence case SUBJECT: return id + shared.getNumberOfElements(); case OBJECT: { - Iterator> iter = objects.entrySet().iterator(); + Iterator> iter = objects.entrySet().iterator(); int count = 0; - CharSequence type = LiteralsUtils.getType(ByteStringUtil.asByteString(str)); + ByteString type = (ByteString) LiteralsUtils.getType(ByteStringUtil.asByteString(str)); while (iter.hasNext()) { - Map.Entry entry = iter.next(); + Map.Entry entry = iter.next(); count+= entry.getValue().getNumberOfElements(); if(type.equals(entry.getKey())) { count -= entry.getValue().getNumberOfElements(); @@ -76,11 +75,11 @@ protected long getLocalId(long id, TripleComponentRole position) { if(id<=shared.getNumberOfElements()) { return id; } else { - Iterator> hmIterator = objects.entrySet().iterator(); + Iterator> hmIterator = objects.entrySet().iterator(); // iterate over all subsections in the objects section long count = 0; while (hmIterator.hasNext()) { - Map.Entry entry = hmIterator.next(); + Map.Entry entry = hmIterator.next(); long numElts; //what??? @@ -108,13 +107,13 @@ protected long getLocalId(long id, TripleComponentRole position) { * @see hdt.dictionary.Dictionary#stringToId(java.lang.CharSequence, datatypes.TripleComponentRole) */ @Override - public long stringToId(CharSequence str, TripleComponentRole position) { - str = ByteStringUtil.asByteString(str); - - if (str == null || str.length() == 0) { + public long stringToId(CharSequence sstr, TripleComponentRole position) { + if (sstr == null || sstr.length() == 0) { return 0; } + ByteString str = ByteString.of(sstr); + long ret; switch(position) { case SUBJECT: @@ -222,24 +221,24 @@ private AbstractMap.SimpleEntry getSectio switch (role) { case SUBJECT: if(id<=shared.getNumberOfElements()) { - return new AbstractMap.SimpleEntry<>(SECTION,shared); + return new AbstractMap.SimpleEntry<>(SectionUtil.SECTION,shared); } else { - return new AbstractMap.SimpleEntry<>(SECTION,subjects); + return new AbstractMap.SimpleEntry<>(SectionUtil.SECTION,subjects); } case PREDICATE: - return new AbstractMap.SimpleEntry<>(SECTION,predicates); + return new AbstractMap.SimpleEntry<>(SectionUtil.SECTION,predicates); case OBJECT: if(id<=shared.getNumberOfElements()) { - return new AbstractMap.SimpleEntry<>(SECTION,shared); + return new AbstractMap.SimpleEntry<>(SectionUtil.SECTION,shared); } else { - Iterator> hmIterator = objects.entrySet().iterator(); + Iterator> hmIterator = objects.entrySet().iterator(); // iterate over all subsections in the objects section DictionarySectionPrivate desiredSection = null; - CharSequence type = CompactString.EMPTY; + ByteString type = ByteString.empty(); int count = 0; while (hmIterator.hasNext()){ - Map.Entry entry = hmIterator.next(); + Map.Entry entry = hmIterator.next(); DictionarySectionPrivate subSection = entry.getValue(); count += subSection.getNumberOfElements(); if(id <= shared.getNumberOfElements()+ count){ @@ -261,7 +260,7 @@ private AbstractMap.SimpleEntry getSectio public CharSequence idToString(long id, TripleComponentRole role) { AbstractMap.SimpleEntry 
section = getSection(id, role); long localId = getLocalId(id, role); - if(section.getKey().equals(LiteralsUtils.NO_DATATYPE) || section.getKey().equals(SECTION)) + if(section.getKey().equals(LiteralsUtils.NO_DATATYPE) || section.getKey().equals(SectionUtil.SECTION)) return section.getValue().extract(localId); else { if(section.getValue() == null) { @@ -280,8 +279,8 @@ public CharSequence idToString(long id, TripleComponentRole role) { } } } - private DictionarySectionPrivate getSubSection(CharSequence str){ - return objects.get(LiteralsUtils.getType(str)); + private DictionarySectionPrivate getSubSection(ByteString str){ + return objects.get((ByteString) LiteralsUtils.getType(str)); } @Override public CharSequence dataTypeOfId(long id) { @@ -289,12 +288,12 @@ public CharSequence dataTypeOfId(long id) { } public AbstractMap.SimpleEntry getDataTypeRange(CharSequence dataType){ - CharSequence seq = LiteralsUtils.embed(ByteStringUtil.asByteString(dataType)); + ByteString seq = LiteralsUtils.embed(ByteStringUtil.asByteString(dataType)); if(objects.containsKey(seq)) { // literals subsection exist - Iterator> iter = objects.entrySet().iterator(); + Iterator> iter = objects.entrySet().iterator(); int count = 0; while (iter.hasNext()) { - Map.Entry entry = iter.next(); + Map.Entry entry = iter.next(); count += entry.getValue().getNumberOfElements(); if (seq.equals(entry.getKey())) { count -= entry.getValue().getNumberOfElements(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java index 4f891d8f..48a34a41 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java @@ -19,7 +19,7 @@ import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; -import org.rdfhdt.hdt.util.string.ByteStringUtil; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.CompactString; @@ -58,12 +58,12 @@ public void load(TempDictionary other, ProgressListener listener) { Iterator iter = other.getObjects().getEntries(); // TODO: allow the usage of OneReadDictionarySection - Map literalsCounts = new HashMap<>(other.getObjects().getLiteralsCounts()); + Map literalsCounts = new HashMap<>(other.getObjects().getLiteralsCounts()); literalsCounts.computeIfPresent(LiteralsUtils.NO_DATATYPE, (key, value) -> (value - other.getShared().getNumberOfElements())); CustomIterator customIterator = new CustomIterator(iter, literalsCounts); while (customIterator.hasNext()) { PFCDictionarySection section = new PFCDictionarySection(spec); - CharSequence type = LiteralsUtils.getType(customIterator.prev); + ByteString type = ByteString.of(LiteralsUtils.getType(customIterator.prev)); long numEntries = literalsCounts.get(type); section.load(customIterator, numEntries, listener); @@ -98,48 +98,45 @@ public void save(OutputStream output, ControlInfo ci, ProgressListener listener) ------------------ */ private void writeLiteralsMap(OutputStream output, ProgressListener listener) throws IOException { - Iterator> hmIterator = objects.entrySet().iterator(); + Iterator> hmIterator = objects.entrySet().iterator(); int numberOfTypes = objects.size(); VByte.encode(output, numberOfTypes); - List types = new 
ArrayList<>(); + List types = new ArrayList<>(); while (hmIterator.hasNext()) { - Map.Entry entry = hmIterator.next(); - CharSequence uri = entry.getKey(); - String uriStr = uri.toString(); - byte[] bytes = uriStr.getBytes(); - VByte.encode(output, bytes.length); - IOUtil.writeBuffer(output, bytes, 0, bytes.length, listener); + Map.Entry entry = hmIterator.next(); + ByteString uri = entry.getKey(); + IOUtil.writeSizedBuffer(output, uri.getBuffer(), 0, uri.length(), listener); types.add(uri); } - for (CharSequence type : types) { + for (ByteString type : types) { this.objects.get(type).save(output, listener); } } private void readLiteralsMap(InputStream input, ProgressListener listener) throws IOException { int numberOfTypes = (int) VByte.decode(input); - List types = new ArrayList<>(); + List types = new ArrayList<>(); for (int i = 0; i < numberOfTypes; i++) { int length = (int) VByte.decode(input); byte[] type = IOUtil.readBuffer(input, length, listener); types.add(new CompactString(type)); } - for (CharSequence type : types) { + for (ByteString type : types) { this.objects.put(type, DictionarySectionFactory.loadFrom(input, listener)); } } private void mapLiteralsMap(CountInputStream input, File f, ProgressListener listener) throws IOException { int numberOfTypes = (int) VByte.decode(input); - List types = new ArrayList<>(); + List types = new ArrayList<>(); for (int i = 0; i < numberOfTypes; i++) { int length = (int) VByte.decode(input); byte[] type = IOUtil.readBuffer(input, length, listener); types.add(new CompactString(type)); } - for (CharSequence type : types) { + for (ByteString type : types) { this.objects.put(type, DictionarySectionFactory.loadFrom(input, f, listener)); } @@ -190,7 +187,7 @@ public long getNAllObjects() { } @Override - public TreeMap getAllObjects() { + public TreeMap getAllObjects() { return new TreeMap<>(objects); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java index 3a058f1e..5830fb90 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java @@ -18,6 +18,7 @@ import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.CompactString; @@ -27,6 +28,7 @@ import java.io.OutputStream; import java.util.ArrayList; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.TreeMap; @@ -53,13 +55,13 @@ public void load(TempDictionary other, ProgressListener listener) { predicates.load(other.getPredicates(), iListener); Iterator iter = other.getObjects().getEntries(); - Map literalsCounts = ((HashDictionarySection)other.getObjects()).getLiteralsCounts(); + Map literalsCounts = ((HashDictionarySection)other.getObjects()).getLiteralsCounts(); literalsCounts.computeIfPresent(LiteralsUtils.NO_DATATYPE, (key, value) -> (value - other.getShared().getNumberOfElements())); CustomIterator customIterator = new CustomIterator(iter,literalsCounts); while (customIterator.hasNext()){ PFCDictionarySectionBig section = new PFCDictionarySectionBig(spec); - String type = LiteralsUtils.getType(customIterator.prev).toString(); + ByteString type 
= (ByteString) LiteralsUtils.getType(customIterator.prev); long numEntries = literalsCounts.get(type); section.load(customIterator,numEntries,listener); @@ -95,33 +97,33 @@ private void writeLiteralsMap(OutputStream output,ProgressListener listener) thr int numberOfTypes = objects.size(); VByte.encode(output, numberOfTypes); - ArrayList types = new ArrayList<>(); + List types = new ArrayList<>(); - for (CharSequence uriKey : objects.keySet()) { + for (ByteString uriKey : objects.keySet()) { IOUtil.writeSizedBuffer(output, uriKey.toString().getBytes(), listener); types.add(uriKey); } - for(CharSequence type:types){ + for(ByteString type : types){ this.objects.get(type).save(output,listener); } } private void readLiteralsMap(InputStream input,ProgressListener listener) throws IOException { int numberOfTypes = (int) VByte.decode(input); - ArrayList types = new ArrayList<>(); + List types = new ArrayList<>(); for (int i = 0; i < numberOfTypes; i++) { types.add(new CompactString(IOUtil.readSizedBuffer(input, listener))); } - for(CharSequence type : types){ + for(ByteString type : types){ this.objects.put(type,DictionarySectionFactory.loadFrom(input,listener)); } } private void mapLiteralsMap(CountInputStream input,File f,ProgressListener listener) throws IOException { int numberOfTypes = (int) VByte.decode(input); - ArrayList types = new ArrayList<>(); + List types = new ArrayList<>(); for (int i = 0; i < numberOfTypes; i++) { types.add(new CompactString(IOUtil.readSizedBuffer(input, listener))); } - for(CharSequence type : types){ + for(ByteString type : types){ this.objects.put(type,DictionarySectionFactory.loadFrom(input,f,listener)); } @@ -172,7 +174,7 @@ public long getNAllObjects() { } @Override - public Map getAllObjects() { + public Map getAllObjects() { return new TreeMap<>(objects); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java index 8ec21b20..41bf7c69 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java @@ -31,6 +31,7 @@ import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMappingBack; import org.rdfhdt.hdt.dictionary.impl.utilCat.CatUnion; import org.rdfhdt.hdt.dictionary.impl.utilCat.CatWrapper; +import org.rdfhdt.hdt.dictionary.impl.utilCat.SectionUtil; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.ControlInfo; @@ -42,6 +43,7 @@ import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.ListenerUtil; import org.rdfhdt.hdt.util.listener.PrefixListener; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.CompactString; @@ -56,17 +58,20 @@ import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; +import java.util.List; import java.util.Map; public class MultipleSectionDictionaryCat implements DictionaryCat { private static final int DEFAULT_BLOCK_SIZE = 16; private static final int BLOCK_PER_BUFFER = 1000000; - private static final CharSequence NO_DT_OBJECTS = LiteralsUtils.NO_DATATYPE; + private static final ByteString NO_DT_OBJECTS = LiteralsUtils.NO_DATATYPE; + private static final ByteString NO_DT_OBJECTS_1 = 
NO_DT_OBJECTS.copyAppend("1"); + private static final ByteString NO_DT_OBJECTS_2 = NO_DT_OBJECTS.copyAppend("2"); private final String location; private long numShared; - private final HashMap allMappings = new HashMap<>(); + private final HashMap allMappings = new HashMap<>(); private CatMappingBack mappingS; public MultipleSectionDictionaryCat(String location) { @@ -77,36 +82,43 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener Comparator comparator = CharSequenceComparator.getInstance(); // Initialize all mappings ...... - allMappings.put("P1",new CatMapping(location,"P1",dictionary1.getPredicates().getNumberOfElements())); - allMappings.put("P2",new CatMapping(location,"P2",dictionary2.getPredicates().getNumberOfElements())); - allMappings.put("S1",new CatMapping(location,"S1",dictionary1.getSubjects().getNumberOfElements())); - allMappings.put("S2",new CatMapping(location,"S2",dictionary2.getSubjects().getNumberOfElements())); - allMappings.put("O1",new CatMapping(location, "O1",dictionary1.getNAllObjects())); - allMappings.put("O2",new CatMapping(location, "O2",dictionary2.getNAllObjects())); - allMappings.put("SH1",new CatMapping(location,"SH1",dictionary1.getShared().getNumberOfElements())); - allMappings.put("SH2",new CatMapping(location,"SH2",dictionary2.getShared().getNumberOfElements())); - Iterator> hmIterator1 = dictionary1.getAllObjects().entrySet().iterator(); + allMappings.put(SectionUtil.P1, new CatMapping(location, SectionUtil.P1, dictionary1.getPredicates().getNumberOfElements())); + allMappings.put(SectionUtil.P2, new CatMapping(location, SectionUtil.P2, dictionary2.getPredicates().getNumberOfElements())); + allMappings.put(SectionUtil.S1, new CatMapping(location, SectionUtil.S1, dictionary1.getSubjects().getNumberOfElements())); + allMappings.put(SectionUtil.S2, new CatMapping(location, SectionUtil.S2, dictionary2.getSubjects().getNumberOfElements())); + allMappings.put(SectionUtil.O1, new CatMapping(location, SectionUtil.O1, dictionary1.getNAllObjects())); + allMappings.put(SectionUtil.O2, new CatMapping(location, SectionUtil.O2, dictionary2.getNAllObjects())); + allMappings.put(SectionUtil.SH1, new CatMapping(location, SectionUtil.SH1, dictionary1.getShared().getNumberOfElements())); + allMappings.put(SectionUtil.SH2, new CatMapping(location, SectionUtil.SH2, dictionary2.getShared().getNumberOfElements())); + Map allObjects1 = dictionary1.getAllObjects(); + Iterator> hmIterator1 = allObjects1.entrySet().iterator(); int countSubSections1 = 0; int countSubSections2 = 0; while (hmIterator1.hasNext()){ Map.Entry entry = hmIterator1.next(); - String prefix = "sub"+countSubSections1; + ByteString prefix; if((entry.getKey()).equals(NO_DT_OBJECTS)) { - prefix = entry.getKey().toString(); + prefix = NO_DT_OBJECTS; + } else { + prefix = SectionUtil.createSub(countSubSections1); } - allMappings.put(prefix+"1",new CatMapping(location,prefix+"1", - entry.getValue().getNumberOfElements())); + prefix = prefix.copyAppend("1"); + allMappings.put(prefix,new CatMapping(location,prefix, entry.getValue().getNumberOfElements())); countSubSections1++; } - Iterator> hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); + Map allObjects2 = dictionary2.getAllObjects(); + Iterator> hmIterator2 = allObjects2.entrySet().iterator(); while (hmIterator2.hasNext()){ Map.Entry entry = hmIterator2.next(); - String prefix = "sub"+countSubSections2; + ByteString prefix; if((entry.getKey()).equals(NO_DT_OBJECTS)) { - prefix = entry.getKey().toString(); + 
prefix = NO_DT_OBJECTS; + } else { + prefix = SectionUtil.createSub(countSubSections2); } - allMappings.put(prefix+"2",new CatMapping(location,prefix+"2", entry.getValue().getNumberOfElements())); + prefix = prefix.copyAppend("2"); + allMappings.put(prefix,new CatMapping(location,prefix, entry.getValue().getNumberOfElements())); countSubSections2++; } @@ -121,17 +133,17 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener int numCommonPredicates = 0; - CatIntersection commonP1P2 = new CatIntersection(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),"P1"), - new CatWrapper(dictionary2.getPredicates().getSortedEntries(),"P2")); + CatIntersection commonP1P2 = new CatIntersection(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),SectionUtil.P1), + new CatWrapper(dictionary2.getPredicates().getSortedEntries(),SectionUtil.P2)); while (commonP1P2.hasNext()){ commonP1P2.next(); numCommonPredicates++; } long numPredicates = dictionary1.getPredicates().getNumberOfElements()+dictionary2.getPredicates().getNumberOfElements()-numCommonPredicates; - ArrayList> addPredicatesList = new ArrayList<>(); - addPredicatesList.add(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),"P1")); - addPredicatesList.add(new CatWrapper(dictionary2.getPredicates().getSortedEntries(),"P2")); + List> addPredicatesList = new ArrayList<>(); + addPredicatesList.add(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),SectionUtil.P1)); + addPredicatesList.add(new CatWrapper(dictionary2.getPredicates().getSortedEntries(),SectionUtil.P2)); CatUnion itAddPredicates = new CatUnion(addPredicatesList); catSection(numPredicates, 3,itAddPredicates, new CatUnion(new ArrayList<>()),allMappings, iListener); @@ -144,16 +156,18 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener ArrayList> skipSubjectList = new ArrayList<>(); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"), - new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); - if(dictionary2.getAllObjects().containsKey(NO_DT_OBJECTS)) - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"), - new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(),NO_DT_OBJECTS+"2"))); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"), - new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); - if(dictionary1.getAllObjects().containsKey(NO_DT_OBJECTS)) - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"), - new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(),NO_DT_OBJECTS+"1"))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1), + new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2))); + if(allObjects2.containsKey(NO_DT_OBJECTS)) { + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(), SectionUtil.S1), + new CatWrapper(allObjects2.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_2))); + } + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2), + new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1))); + if(allObjects1.containsKey(NO_DT_OBJECTS)) { + skipSubjectList.add(new 
CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(), SectionUtil.S2), + new CatWrapper(allObjects1.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_1))); + } CatUnion skipSubject = new CatUnion(skipSubjectList); int numSkipSubjects = 0; while (skipSubject.hasNext()){ @@ -161,8 +175,8 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener numSkipSubjects++; } int numCommonSubjects = 0; - CatIntersection commonS1S2 = new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"), - new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2")); + CatIntersection commonS1S2 = new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1), + new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2)); while (commonS1S2.hasNext()){ commonS1S2.next(); numCommonSubjects++; @@ -171,21 +185,21 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener skipSubjectList = new ArrayList<>(); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"), - new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); - if(dictionary2.getAllObjects().containsKey(NO_DT_OBJECTS)) - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"), - new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(),NO_DT_OBJECTS+"2"))); - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"), - new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); - if(dictionary1.getAllObjects().containsKey(NO_DT_OBJECTS)) - skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"), - new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(),NO_DT_OBJECTS+"1"))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1), + new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2))); + if(allObjects2.containsKey(NO_DT_OBJECTS)) + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(), SectionUtil.S1), + new CatWrapper(allObjects2.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_2))); + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2), + new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1))); + if(allObjects1.containsKey(NO_DT_OBJECTS)) + skipSubjectList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2), + new CatWrapper(allObjects1.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_1))); skipSubject = new CatUnion(skipSubjectList); ArrayList> addSubjectsList = new ArrayList<>(); - addSubjectsList.add(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1")); - addSubjectsList.add(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2")); + addSubjectsList.add(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1)); + addSubjectsList.add(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2)); CatUnion itAddSubjects = new CatUnion(addSubjectsList); catSection(numSubjects, 2,itAddSubjects,skipSubject ,allMappings, iListener); @@ -197,24 +211,24 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, 
ProgressListener } ArrayList> skipObjectsList = new ArrayList<>(); - if(dictionary1.getAllObjects().containsKey(NO_DT_OBJECTS)) { + if(allObjects1.containsKey(NO_DT_OBJECTS)) { skipObjectsList.add(new CatIntersection( - new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "1"), - new CatWrapper(dictionary2.getShared().getSortedEntries(), "SH2")) + new CatWrapper(allObjects1.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_1), + new CatWrapper(dictionary2.getShared().getSortedEntries(), SectionUtil.SH2)) ); skipObjectsList.add(new CatIntersection( - new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "1"), - new CatWrapper(dictionary2.getSubjects().getSortedEntries(), "S2")) + new CatWrapper(allObjects1.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_1), + new CatWrapper(dictionary2.getSubjects().getSortedEntries(), SectionUtil.S2)) ); } - if(dictionary2.getAllObjects().containsKey(NO_DT_OBJECTS)) { + if(allObjects2.containsKey(NO_DT_OBJECTS)) { skipObjectsList.add(new CatIntersection( - new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "2"), - new CatWrapper(dictionary1.getShared().getSortedEntries(), "SH1")) + new CatWrapper(allObjects2.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_2), + new CatWrapper(dictionary1.getShared().getSortedEntries(), SectionUtil.SH1)) ); skipObjectsList.add(new CatIntersection( - new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "2"), - new CatWrapper(dictionary1.getSubjects().getSortedEntries(), "S1")) + new CatWrapper(allObjects2.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_2), + new CatWrapper(dictionary1.getSubjects().getSortedEntries(), SectionUtil.S1)) ); } CatUnion skipObject = new CatUnion(skipObjectsList); @@ -226,12 +240,12 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener int numCommonObjects = 0; ArrayList> commonObjectsList = new ArrayList<>(); - hmIterator1 = dictionary1.getAllObjects().entrySet().iterator(); - hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); + hmIterator1 = allObjects1.entrySet().iterator(); + hmIterator2 = allObjects2.entrySet().iterator(); boolean skip1 = false; boolean skip2 = false; - CharSequence dataType1 = CompactString.EMPTY; - CharSequence dataType2 = CompactString.EMPTY; + ByteString dataType1 = CompactString.EMPTY; + ByteString dataType2 = CompactString.EMPTY; DictionarySection section1 = null; DictionarySection section2 = null; while (hmIterator1.hasNext() || hmIterator2.hasNext()){ @@ -240,20 +254,20 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener if(!skip1) { Map.Entry entry1 = hmIterator1.next(); section1 = entry1.getValue(); - dataType1 = entry1.getKey(); + dataType1 = ByteString.of(entry1.getKey()); } } if(hmIterator2.hasNext()){ if(!skip2){ Map.Entry entry2 = hmIterator2.next(); section2 = entry2.getValue(); - dataType2 = entry2.getKey(); + dataType2 = ByteString.of(entry2.getKey()); } } if(section1 != null && section2 != null && dataType1.equals(dataType2)) { commonObjectsList.add(new CatIntersection( - new CatWrapper(section1.getSortedEntries(), dataType1 + "_1"), - new CatWrapper(section2.getSortedEntries(), dataType2 + "_2") + new CatWrapper(section1.getSortedEntries(), dataType1.copyAppend("_1")), + new CatWrapper(section2.getSortedEntries(), dataType2.copyAppend("_2")) )); }else{ int comp = 
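The skip1/skip2 flags above implement a two-pointer walk that aligns the typed object subsections of the two dictionaries by datatype key: equal keys are catted together, otherwise only the smaller side advances. Reduced to two sorted key lists (illustrative data), the walk looks like this:

    import java.util.List;

    public class DatatypeAlignment {
        public static void main(String[] args) {
            // sorted datatype keys of the object subsections of both HDTs
            List<String> d1 = List.of("NO_DATATYPE", "xsd:date", "xsd:int");
            List<String> d2 = List.of("NO_DATATYPE", "xsd:int", "xsd:string");
            int i = 0, j = 0;
            while (i < d1.size() || j < d2.size()) {
                if (i < d1.size() && j < d2.size()) {
                    int c = d1.get(i).compareTo(d2.get(j));
                    if (c == 0) { // same datatype on both sides: cat the subsections
                        System.out.println("common: " + d1.get(i)); i++; j++;
                    } else if (c < 0) { // only in the first HDT: copy as-is
                        System.out.println("only 1: " + d1.get(i)); i++;
                    } else { // only in the second HDT
                        System.out.println("only 2: " + d2.get(j)); j++;
                    }
                } else if (i < d1.size()) {
                    System.out.println("only 1: " + d1.get(i)); i++;
                } else {
                    System.out.println("only 2: " + d2.get(j)); j++;
                }
            }
        }
    }

The skip flags in the real code play the role of "do not pull a new entry from this side yet", since the sections come from iterators rather than indexable lists.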
comparator.compare(dataType1, dataType2); @@ -280,59 +294,60 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener skipObjectsList = new ArrayList<>(); - if(dictionary1.getAllObjects().containsKey(NO_DT_OBJECTS)) { + if(allObjects1.containsKey(NO_DT_OBJECTS)) { skipObjectsList.add(new CatIntersection( - new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "1"), - new CatWrapper(dictionary2.getShared().getSortedEntries(), "SH2")) + new CatWrapper(allObjects1.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_1), + new CatWrapper(dictionary2.getShared().getSortedEntries(), SectionUtil.SH2)) ); skipObjectsList.add(new CatIntersection( - new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "1"), - new CatWrapper(dictionary2.getSubjects().getSortedEntries(), "S2"))); + new CatWrapper(allObjects1.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_1), + new CatWrapper(dictionary2.getSubjects().getSortedEntries(), SectionUtil.S2))); } - if(dictionary2.getAllObjects().containsKey(NO_DT_OBJECTS)) { + if(allObjects2.containsKey(NO_DT_OBJECTS)) { skipObjectsList.add(new CatIntersection( - new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "2"), - new CatWrapper(dictionary1.getShared().getSortedEntries(), "SH1")) + new CatWrapper(allObjects2.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_2), + new CatWrapper(dictionary1.getShared().getSortedEntries(), SectionUtil.SH1)) ); skipObjectsList.add(new CatIntersection( - new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "2"), - new CatWrapper(dictionary1.getSubjects().getSortedEntries(), "S1")) + new CatWrapper(allObjects2.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_2), + new CatWrapper(dictionary1.getSubjects().getSortedEntries(), SectionUtil.S1)) ); } skipObject = new CatUnion(skipObjectsList); long numObject = dictionary1.getNAllObjects()+dictionary2.getNAllObjects()-numCommonObjects-numSkipObjects; - hmIterator1 = dictionary1.getAllObjects().entrySet().iterator(); - hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); + hmIterator1 = allObjects1.entrySet().iterator(); + hmIterator2 = allObjects2.entrySet().iterator(); int type = 4; - ArrayList dataTypes = new ArrayList<>(); + List dataTypes = new ArrayList<>(); // iterate over objects subsections and cat them together countSubSections1 = 0; countSubSections2 = 0; - HashMap offsets = new HashMap<>(); + Map offsets = new HashMap<>(); long total = 0; skip1 = false; skip2 = false; - dataType1 = CompactString.EMPTY; - dataType2 = CompactString.EMPTY; + dataType1 = ByteString.empty(); + dataType2 = ByteString.empty(); section1 = null; section2 = null; - String prefix1 = ""; - String prefix2= ""; + ByteString prefix1 = ByteString.empty(); + ByteString prefix2= ByteString.empty(); while (hmIterator1.hasNext() || hmIterator2.hasNext()){ - ArrayList> addObjectsList = new ArrayList<>(); - ArrayList> countObjectsList = new ArrayList<>(); + List> addObjectsList = new ArrayList<>(); + List> countObjectsList = new ArrayList<>(); if(hmIterator1.hasNext()){ if(!skip1) { Map.Entry entry = hmIterator1.next(); - dataType1 = entry.getKey(); + dataType1 = ByteString.of(entry.getKey()); section1 = entry.getValue(); - prefix1 = "sub" + countSubSections1; if (dataType1.equals(NO_DT_OBJECTS)) { - prefix1 = dataType1.toString(); + prefix1 = NO_DT_OBJECTS; + } else { + prefix1 = 
SectionUtil.createSub(countSubSections1); } countSubSections1++; } @@ -340,42 +355,43 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener if(hmIterator2.hasNext()){ if(!skip2) { Map.Entry entry = hmIterator2.next(); - dataType2 = entry.getKey(); + dataType2 = ByteString.of(entry.getKey()); section2 = entry.getValue(); - prefix2 = "sub" + countSubSections2; if (dataType2.equals(NO_DT_OBJECTS)) { - prefix2 = dataType2.toString(); + prefix2 = NO_DT_OBJECTS; + } else { + prefix2 = SectionUtil.createSub(countSubSections2); } countSubSections2++; } } - CharSequence dataType = CompactString.EMPTY; + ByteString dataType = CompactString.EMPTY; if(section1 != null && section2 != null && dataType1.equals(dataType2)){ dataType = dataType1; addObjectsList.add(new CatWrapper( section1.getSortedEntries(), - prefix1+"1") + prefix1.copyAppend("1")) ); countObjectsList.add(new CatWrapper( section1.getSortedEntries(), - prefix1+"1") + prefix1.copyAppend("1")) ); addObjectsList.add(new CatWrapper( section2.getSortedEntries(), - prefix2+"2") + prefix2.copyAppend("2")) ); countObjectsList.add(new CatWrapper( section2.getSortedEntries(), - prefix2+"2") + prefix2.copyAppend("2")) ); skip1 = false; skip2 = false; if(!hmIterator1.hasNext()){ section1 = null; - dataType1 = CompactString.EMPTY; + dataType1 = ByteString.empty(); }else if(!hmIterator2.hasNext()){ section2 = null; - dataType2 = CompactString.EMPTY; + dataType2 = ByteString.empty(); } }else{ boolean fromOne = false; @@ -396,15 +412,15 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener dataType = dataType1; addObjectsList.add(new CatWrapper( section1.getSortedEntries(), - prefix1+"1") + prefix1.copyAppend("1")) ); countObjectsList.add(new CatWrapper( section1.getSortedEntries(), - prefix1+"1") + prefix1.copyAppend("1")) ); if(!hmIterator1.hasNext()){ section1 = null; - dataType1 = ""; + dataType1 = ByteString.empty(); skip2 = false; }else { skip1 = false; @@ -414,15 +430,15 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener dataType = dataType2; addObjectsList.add(new CatWrapper( section2.getSortedEntries(), - prefix2+"2") + prefix2.copyAppend("2")) ); countObjectsList.add(new CatWrapper( section2.getSortedEntries(), - prefix2+"2") + prefix2.copyAppend("2")) ); if(!hmIterator2.hasNext()){ section2 = null; - dataType2 = CompactString.EMPTY; + dataType2 = ByteString.empty(); skip1 = false; }else { skip1 = true; @@ -459,23 +475,23 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } int numCommonS1O2 = 0; - if(dictionary2.getAllObjects().containsKey(NO_DT_OBJECTS)) { - CatIntersection i2 = new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(), "S1"), new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "2")); + if(allObjects2.containsKey(NO_DT_OBJECTS)) { + CatIntersection i2 = new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(), SectionUtil.S1), new CatWrapper(allObjects2.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_2)); while (i2.hasNext()) { i2.next(); numCommonS1O2++; } } int numCommonO1S2 = 0; - if(dictionary1.getAllObjects().containsKey(NO_DT_OBJECTS)) { - CatIntersection i2 = new CatIntersection(new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "1"), new CatWrapper(dictionary2.getSubjects().getSortedEntries(), "S2")); + if(allObjects1.containsKey(NO_DT_OBJECTS)) { + 
CatIntersection i2 = new CatIntersection(new CatWrapper(allObjects1.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_1), new CatWrapper(dictionary2.getSubjects().getSortedEntries(), SectionUtil.S2)); while (i2.hasNext()) { i2.next(); numCommonO1S2++; } } - CatIntersection i2 = new CatIntersection(new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"),new CatWrapper( dictionary2.getShared().getSortedEntries(),"SH2")); + CatIntersection i2 = new CatIntersection(new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1),new CatWrapper( dictionary2.getShared().getSortedEntries(),SectionUtil.SH2)); int numCommonSh1Sh2=0; while (i2.hasNext()){ i2.next(); @@ -484,19 +500,19 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener numShared = dictionary1.getShared().getNumberOfElements()+dictionary2.getShared().getNumberOfElements()-numCommonSh1Sh2+numCommonS1O2+numCommonO1S2; ArrayList> addSharedList = new ArrayList<>(); - addSharedList.add(new CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1")); - addSharedList.add(new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2")); + addSharedList.add(new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1)); + addSharedList.add(new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2)); - if(dictionary1.getAllObjects().containsKey(NO_DT_OBJECTS)) { - addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "1"), new CatWrapper(dictionary2.getShared().getSortedEntries(), "SH2"))); - addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(), "S2"), new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "1"))); + if(allObjects1.containsKey(NO_DT_OBJECTS)) { + addSharedList.add(new CatIntersection(new CatWrapper(allObjects1.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_1), new CatWrapper(dictionary2.getShared().getSortedEntries(), SectionUtil.SH2))); + addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(), SectionUtil.S2), new CatWrapper(allObjects1.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_1))); } - if(dictionary2.getAllObjects().containsKey(NO_DT_OBJECTS)) { - addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(), "S1"), new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "2"))); - addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "2"), new CatWrapper(dictionary1.getShared().getSortedEntries(), "SH1"))); + if(allObjects2.containsKey(NO_DT_OBJECTS)) { + addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(), SectionUtil.S1), new CatWrapper(allObjects2.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_2))); + addSharedList.add(new CatIntersection(new CatWrapper(allObjects2.get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS_2), new CatWrapper(dictionary1.getShared().getSortedEntries(), SectionUtil.SH1))); } - addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),"S1"),new CatWrapper(dictionary2.getShared().getSortedEntries(),"SH2"))); - addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2"),new 
CatWrapper(dictionary1.getShared().getSortedEntries(),"SH1"))); + addSharedList.add(new CatIntersection(new CatWrapper(dictionary1.getSubjects().getSortedEntries(),SectionUtil.S1),new CatWrapper(dictionary2.getShared().getSortedEntries(),SectionUtil.SH2))); + addSharedList.add(new CatIntersection(new CatWrapper(dictionary2.getSubjects().getSortedEntries(),SectionUtil.S2),new CatWrapper(dictionary1.getShared().getSortedEntries(),SectionUtil.SH1))); CatUnion itAddShared = new CatUnion(addSharedList); catSection(numShared, 1,itAddShared,new CatUnion(new ArrayList<>()) ,allMappings, iListener); @@ -515,7 +531,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener Files.delete(Path.of(location + "section" + i)); } VByte.encode(outFinal, dataTypes.size()); - for(CharSequence datatype:dataTypes){ + for(ByteString datatype : dataTypes){ String datatypeStr = datatype.toString(); byte[] bytes = datatypeStr.getBytes(); IOUtil.writeSizedBuffer(outFinal, bytes, 0, bytes.length, iListener); @@ -527,17 +543,19 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } // create the objects mappings long oldId = 0; - hmIterator1 = dictionary1.getAllObjects().entrySet().iterator(); + hmIterator1 = allObjects1.entrySet().iterator(); countSubSections1 = 0; countSubSections2 = 0; while (hmIterator1.hasNext()){ Map.Entry entry = hmIterator1.next(); - CharSequence dataType = entry.getKey(); - String prefix = "sub"+countSubSections1; - if(dataType.equals(NO_DT_OBJECTS)) - prefix = dataType+"1"; - else - prefix +="1"; + ByteString dataType = ByteString.of(entry.getKey()); + ByteString prefix; + if(dataType.equals(NO_DT_OBJECTS)) { + prefix = NO_DT_OBJECTS; + } else { + prefix =SectionUtil.createSub(countSubSections1); + } + prefix = prefix.copyAppend("1"); if(allMappings.containsKey(prefix)) { CatMapping mapping = allMappings.get(prefix); for (int i = 0; i < mapping.getSize(); i++) { @@ -545,7 +563,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener if (mapping.getType(i) != 1 && offsets.containsKey(dataType)) { newId = newId + offsets.get(dataType); } - allMappings.get("O1").set(oldId, newId, (int) mapping.getType(i)); + allMappings.get(SectionUtil.O1).set(oldId, newId, (int) mapping.getType(i)); oldId++; } } @@ -553,23 +571,28 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } oldId = 0; - hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); + hmIterator2 = allObjects2.entrySet().iterator(); while (hmIterator2.hasNext()){ Map.Entry entry = hmIterator2.next(); - CharSequence dataType = entry.getKey(); - String prefix = "sub"+countSubSections2; - if(dataType.equals(NO_DT_OBJECTS)) - prefix = dataType+"2"; - else - prefix +="2"; - if(allMappings.containsKey(prefix)) { - CatMapping mapping = allMappings.get(prefix); + ByteString dataType = ByteString.of(entry.getKey()); + ByteString prefix; + if(dataType.equals(NO_DT_OBJECTS)) { + prefix = NO_DT_OBJECTS; + } else { + prefix = SectionUtil.createSub(countSubSections2); + } + prefix = prefix.copyAppend("2"); + CatMapping mapping = allMappings.get(prefix); + if(mapping != null) { countSubSections2++; for (int i = 0; i < mapping.getSize(); i++) { long newId = mapping.getMapping(i); - if (mapping.getType(i) != 1 && offsets.containsKey(dataType)) - newId = newId + offsets.get(dataType); - allMappings.get("O2").set(oldId, newId, (int) mapping.getType(i)); + long mappingType = mapping.getType(i); + Long offset = offsets.get(dataType); + 
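The offsets map used above records, for each datatype, how many objects precede its subsection in the merged dictionary; a mapping that lands inside a typed subsection becomes a global object ID by adding that subsection's offset. A worked sketch of the arithmetic (illustrative sizes):

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class ObjectOffsets {
        public static void main(String[] args) {
            // per-datatype subsection sizes in the merged dictionary, sorted order
            Map<String, Long> sizes = new LinkedHashMap<>();
            sizes.put("NO_DATATYPE", 100L);
            sizes.put("xsd:int", 40L);
            sizes.put("xsd:string", 250L);

            // offset of each subsection = number of objects before it
            Map<String, Long> offsets = new LinkedHashMap<>();
            long total = 0;
            for (Map.Entry<String, Long> e : sizes.entrySet()) {
                offsets.put(e.getKey(), total);
                total += e.getValue();
            }

            // a local ID inside the xsd:string subsection maps to a global ID
            long localId = 3;
            long globalId = localId + offsets.get("xsd:string"); // 3 + 140 = 143
            System.out.println(globalId);
        }
    }

This is also why the type-1 (shared) mappings are excluded from the addition: shared entries live in their own ID space, not in the concatenated object section.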
if (mappingType != 1 && offset != null) { + newId = newId + offset; + } + allMappings.get(SectionUtil.O2).set(oldId, newId, (int) mappingType); oldId++; } } @@ -577,27 +600,27 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener //calculate the inverse mapping for the subjects, i.e. from the new dictionary subject section to the old ones mappingS = new CatMappingBack(location,numSubjects+numShared); - for (int i=0; i mappings, ProgressListener listener) throws IOException { + private void catSection(long numEntries, int type, CatUnion itAdd , CatUnion itSkip , Map mappings, ProgressListener listener) throws IOException { long numberElements = 0; - String name; + ByteString name; switch (type) { case 2: - name = "subject"; + name = SectionUtil.SECTION_SUBJECT; break; case 3: - name = "object"; + name = SectionUtil.SECTION_OBJECT; break; case 4: - name = "predicate"; + name = SectionUtil.SECTION_PREDICATE; break; default: - name = ""; + name = CompactString.EMPTY; break; } long storedBuffersSize = 0; @@ -636,7 +659,7 @@ private void catSection(long numEntries, int type, CatUnion itAdd , CatUnion itS blocks = new SequenceLog64BigDisk(location+"SequenceLog64BigDisk"+type,64, numEntries/16); byteOut = new ByteArrayOutputStream(16*1024); if (numEntries > 0) { - CharSequence previousStr=null; + ByteString previousStr = null; CatElement skipElement = null; if(itSkip.hasNext()){ @@ -645,7 +668,7 @@ private void catSection(long numEntries, int type, CatUnion itAdd , CatUnion itS while (itAdd.hasNext()){ ListenerUtil.notifyCond(listener, "Analyze section "+name+" ", numberElements, numberElements, numEntries); CatElement nextElement = itAdd.next(); - if (skipElement!= null && nextElement.entity.toString().equals(skipElement.entity.toString())) { + if (skipElement!= null && nextElement.entity.equals(skipElement.entity)) { if(itSkip.hasNext()) skipElement = itSkip.next(); else @@ -653,11 +676,11 @@ private void catSection(long numEntries, int type, CatUnion itAdd , CatUnion itS } else { for (int i = 0; i < nextElement.IDs.size(); i++) { long id = nextElement.IDs.get(i).pos; - String iter = nextElement.IDs.get(i).iter.toString(); + ByteString iter = nextElement.IDs.get(i).iter; mappings.get(iter).set(id - 1, numberElements + 1, type); } - String str = nextElement.entity.toString(); + ByteString str = nextElement.entity; if (numberElements % DEFAULT_BLOCK_SIZE == 0) { blocks.append(storedBuffersSize + byteOut.size()); numBlocks++; @@ -724,7 +747,8 @@ public CatMappingBack getMappingS() { return mappingS; } - public HashMap getAllMappings() { + @Override + public Map getAllMappings() { return allMappings; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java index e6c23d59..7948ea60 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java @@ -17,6 +17,7 @@ import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.util.LiteralsUtils; import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.string.ByteString; import java.io.FileOutputStream; import java.io.IOException; @@ -32,7 +33,7 @@ public class MultipleSectionDictionaryDiff implements DictionaryDiff { private final String location; - private final Map allMappings = new HashMap<>(); + private final Map 
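catSection above consumes a sorted union (itAdd) while discarding the entries that a second sorted stream (itSkip) marks as already accounted for; both streams advance in lockstep, so the filter is a single O(n) pass, and with the switch to ByteString.equals the comparison no longer materialises temporary Strings. The subtract-while-merging core, as a standalone sketch with plain strings:

    import java.util.Iterator;
    import java.util.List;

    public class SortedSubtract {
        /** Print every element of 'add' (sorted) not present in 'skip' (sorted). */
        static void subtract(Iterator<String> add, Iterator<String> skip) {
            String toSkip = skip.hasNext() ? skip.next() : null;
            while (add.hasNext()) {
                String next = add.next();
                if (toSkip != null && next.equals(toSkip)) {
                    // consumed the skip entry, advance the skip stream
                    toSkip = skip.hasNext() ? skip.next() : null;
                } else {
                    System.out.println(next); // kept in the new section
                }
            }
        }

        public static void main(String[] args) {
            subtract(List.of("a", "b", "c", "d").iterator(),
                    List.of("b", "d").iterator()); // prints a, c
        }
    }

Both inputs being sorted on the same order is the invariant that makes the single skipElement cursor sufficient.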
allMappings = new HashMap<>(); private CatMapping mappingBack; public long numShared; public MultipleSectionDictionaryDiff(String location){ @@ -50,20 +51,22 @@ public void close() throws IOException { } @Override public void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) throws IOException { - allMappings.put("predicate",new CatMapping(location,"predicate",dictionary.getPredicates().getNumberOfElements())); - allMappings.put("subject",new CatMapping(location,"subject",dictionary.getSubjects().getNumberOfElements())); + allMappings.put(SectionUtil.SECTION_PREDICATE,new CatMapping(location,SectionUtil.SECTION_PREDICATE,dictionary.getPredicates().getNumberOfElements())); + allMappings.put(SectionUtil.SECTION_SUBJECT,new CatMapping(location,SectionUtil.SECTION_SUBJECT,dictionary.getSubjects().getNumberOfElements())); int countSubSection = 0; for (Map.Entry next : dictionary.getAllObjects().entrySet()) { - String subPrefix = "sub"+countSubSection; + ByteString subPrefix; if(next.getKey().equals(LiteralsUtils.NO_DATATYPE)){ - subPrefix = LiteralsUtils.NO_DATATYPE.toString(); + subPrefix = LiteralsUtils.NO_DATATYPE; + } else { + subPrefix = SectionUtil.createSub(countSubSection); } - allMappings.put(subPrefix,new CatMapping(location,subPrefix,next.getValue().getNumberOfElements())); + allMappings.put(subPrefix, new CatMapping(location,subPrefix,next.getValue().getNumberOfElements())); countSubSection++; } - allMappings.put("object",new CatMapping(location,"object",dictionary.getNAllObjects())); - allMappings.put("shared",new CatMapping(location,"shared",dictionary.getShared().getNumberOfElements())); + allMappings.put(SectionUtil.SECTION_OBJECT,new CatMapping(location,SectionUtil.SECTION_OBJECT,dictionary.getNAllObjects())); + allMappings.put(SectionUtil.SECTION_SHARED,new CatMapping(location,SectionUtil.SECTION_SHARED,dictionary.getShared().getNumberOfElements())); // allMappings.put("shared_o",new CatMapping(location,"shared_o",dictionary.getShared().getNumberOfElements())); // Predicates @@ -71,7 +74,7 @@ public void diff(Dictionary dictionary, Map bitm Iterator predicates = dictionary.getPredicates().getSortedEntries(); // CatWrapper itAddPreds = new CatWrapper(predicates,"predicate"); - DiffWrapper itSkipPreds = new DiffWrapper(predicates,predicatesBitMap,"predicate"); + DiffWrapper itSkipPreds = new DiffWrapper(predicates,predicatesBitMap,SectionUtil.SECTION_PREDICATE); // ArrayList> listAddPred = new ArrayList<>(); // listAddPred.add(itAddPreds); @@ -89,7 +92,7 @@ public void diff(Dictionary dictionary, Map bitm Iterator subjects = dictionary.getSubjects().getSortedEntries(); // CatWrapper itAddSubs = new CatWrapper(subjects,"subject"); - DiffWrapper itSkipSubs = new DiffWrapper(subjects,subjectsBitMap,"subject"); + DiffWrapper itSkipSubs = new DiffWrapper(subjects,subjectsBitMap,SectionUtil.SECTION_SUBJECT); // ArrayList> listAddSubj = new ArrayList<>(); // listAddSubj.add(itAddSubs); @@ -109,14 +112,15 @@ public void diff(Dictionary dictionary, Map bitm // Objects ----------------------------+++++++++++++++++++++++++++++++++---------------------------------------- - List dataTypes = new ArrayList<>(); - Map offsets = new HashMap<>(); + List dataTypes = new ArrayList<>(); + Map offsets = new HashMap<>(); int countSection = 0; long totalObjects = 0; for (Map.Entry next : dictionary.getAllObjects().entrySet()) { int type = 4 + dataTypes.size(); - if(next.getKey().equals(LiteralsUtils.NO_DATATYPE)){ + ByteString key = ByteString.of(next.getKey()); + 
if(key.equals(LiteralsUtils.NO_DATATYPE)){ long numNoDataType = createNoDataTypeSection(bitmaps, dictionary,totalObjects,type); if(numNoDataType > 0){ dataTypes.add(LiteralsUtils.NO_DATATYPE); @@ -124,10 +128,10 @@ public void diff(Dictionary dictionary, Map bitm totalObjects+= numNoDataType; } }else { - Bitmap objectsBitMap = bitmaps.get(next.getKey()); - Iterator objects = dictionary.getAllObjects().get(next.getKey()).getSortedEntries(); + Bitmap objectsBitMap = bitmaps.get(key); + Iterator objects = dictionary.getAllObjects().get(key).getSortedEntries(); - String subPrefix = "sub"+countSection; + ByteString subPrefix = SectionUtil.createSub(countSection); DiffWrapper itSkipObjs = new DiffWrapper(objects, objectsBitMap, subPrefix); ArrayList> listSkipObjs = new ArrayList<>(); @@ -136,8 +140,8 @@ public void diff(Dictionary dictionary, Map bitm long numObject = objectsBitMap.countOnes(); // append the data types of the new dictionary if the section still exists ( number of elts > 0 ) if (numObject > 0) { - dataTypes.add(next.getKey()); - offsets.put(next.getKey(),totalObjects); + dataTypes.add(key); + offsets.put(key,totalObjects); } totalObjects += numObject; SectionUtil.createSection(location, numObject, type, new CatUnion(listSkipObjs), new CatUnion(new ArrayList<>()), allMappings, 0,null); @@ -151,11 +155,11 @@ public void diff(Dictionary dictionary, Map bitm Iterator shared = dictionary.getShared().getSortedEntries(); - DiffWrapper sharedSubj = new DiffWrapper(shared, sharedSubjBitMap,"shared"); + DiffWrapper sharedSubj = new DiffWrapper(shared, sharedSubjBitMap,SectionUtil.SECTION_SHARED); shared = dictionary.getShared().getSortedEntries(); - DiffWrapper sharedObj = new DiffWrapper(shared,sharedObjBitMap,"shared"); + DiffWrapper sharedObj = new DiffWrapper(shared,sharedObjBitMap,SectionUtil.SECTION_SHARED); ArrayList> listShared = new ArrayList<>(); listShared.add(new CatIntersection(sharedSubj,sharedObj)); @@ -168,9 +172,9 @@ public void diff(Dictionary dictionary, Map bitm } listShared = new ArrayList<>(); - sharedSubj = new DiffWrapper(dictionary.getShared().getSortedEntries(), sharedSubjBitMap,"shared"); + sharedSubj = new DiffWrapper(dictionary.getShared().getSortedEntries(), sharedSubjBitMap,SectionUtil.SECTION_SHARED); - sharedObj = new DiffWrapper(dictionary.getShared().getSortedEntries(), sharedObjBitMap,"shared"); + sharedObj = new DiffWrapper(dictionary.getShared().getSortedEntries(), sharedObjBitMap,SectionUtil.SECTION_SHARED); listShared.add(new CatIntersection(sharedSubj,sharedObj)); SectionUtil.createSection(location,numShared,1,new CatUnion(listShared),new CatUnion(new ArrayList<>()),allMappings,0,listener); @@ -190,10 +194,8 @@ public void diff(Dictionary dictionary, Map bitm Files.delete(Path.of(location + "section" + i)); } VByte.encode(out, dataTypes.size()); - for(CharSequence datatype:dataTypes){ - String datatypeStr = datatype.toString(); - byte[] bytes = datatypeStr.getBytes(); - IOUtil.writeSizedBuffer(out, bytes, 0, bytes.length, listener); + for (ByteString datatype : dataTypes) { + IOUtil.writeSizedBuffer(out, datatype.getBuffer(), listener); } for (int i = 0; i < dataTypes.size(); i++) { Files.copy(Path.of(location + "section" + (4 + i)), out); @@ -204,42 +206,46 @@ public void diff(Dictionary dictionary, Map bitm long oldId = 0; countSection = 0; for (CharSequence dataType : dictionary.getAllObjects().keySet()) { - String subPrefix = "sub"+countSection; - if(dataType.equals(LiteralsUtils.NO_DATATYPE)) { - subPrefix = dataType.toString(); + ByteString 
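DiffWrapper, as used above, keeps only the entries whose position is flagged in a bitmap, and the surviving entries are renumbered densely; objectsBitMap.countOnes() then gives the size of the diffed section without a second pass. A reduced model with a boolean[] standing in for the Bitmap interface (illustrative data):

    import java.util.List;

    public class BitmapFilter {
        public static void main(String[] args) {
            List<String> section = List.of("alice", "bob", "carol", "dave");
            boolean[] keep = {true, false, true, true}; // the diff bitmap

            long newId = 0;
            for (int pos = 0; pos < section.size(); pos++) {
                if (keep[pos]) {
                    newId++; // IDs in the diffed section stay dense: 1, 2, 3...
                    System.out.println(section.get(pos) + " -> " + newId);
                }
            }
            // a countOnes() equivalent would give newId without iterating twice
        }
    }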
subPrefix; + ByteString dataTypeB = ByteString.of(dataType); + if(dataTypeB.equals(LiteralsUtils.NO_DATATYPE)) { + subPrefix = LiteralsUtils.NO_DATATYPE; + } else { + subPrefix = SectionUtil.createSub(countSection); } - if(allMappings.containsKey(subPrefix)){ + if (allMappings.containsKey(subPrefix)) { CatMapping mapping = allMappings.get(subPrefix); for (int i = 0; i < mapping.getSize(); i++) { long newId = mapping.getMapping(i); - if (mapping.getType(i) != 1 && offsets.containsKey(dataType)) { - newId = newId + offsets.get(dataType); + Long offset; + if (mapping.getType(i) != 1 && (offset = offsets.get(dataTypeB)) != null) { + newId = newId + offset; } - allMappings.get("object").set(oldId, newId, (int) mapping.getType(i)); + allMappings.get(SectionUtil.SECTION_OBJECT).set(oldId, newId, (int) mapping.getType(i)); oldId++; } } countSection++; } - mappingBack = new CatMapping(location,"back",numSubj+numShared); + mappingBack = new CatMapping(location,SectionUtil.BACK,numSubj+numShared); if(mappingBack.getSize() > 0 ) { - for (int i = 0; i < allMappings.get("shared").getSize(); i++) { - long type = allMappings.get("shared").getType(i); + for (int i = 0; i < allMappings.get(SectionUtil.SECTION_SHARED).getSize(); i++) { + long type = allMappings.get(SectionUtil.SECTION_SHARED).getType(i); if (type == 1) { - mappingBack.set(allMappings.get("shared").getMapping(i) - 1, i + 1, 1); + mappingBack.set(allMappings.get(SectionUtil.SECTION_SHARED).getMapping(i) - 1, i + 1, 1); } else if(type == 2){ - mappingBack.set(allMappings.get("shared").getMapping(i) + numShared - 1, i + 1, 2); + mappingBack.set(allMappings.get(SectionUtil.SECTION_SHARED).getMapping(i) + numShared - 1, i + 1, 2); } } - for (int i = 0; i < allMappings.get("subject").getSize(); i++) { - long type = allMappings.get("subject").getType(i); + for (int i = 0; i < allMappings.get(SectionUtil.SECTION_SUBJECT).getSize(); i++) { + long type = allMappings.get(SectionUtil.SECTION_SUBJECT).getType(i); if ( type == 1) { - mappingBack.set(allMappings.get("subject").getMapping(i) - 1, (i + 1 + (int) dictionary.getNshared()), 1); + mappingBack.set(allMappings.get(SectionUtil.SECTION_SUBJECT).getMapping(i) - 1, (i + 1 + (int) dictionary.getNshared()), 1); } else if(type == 2){ - mappingBack.set(allMappings.get("subject").getMapping(i) + numShared - 1, (i + 1 + (int) dictionary.getNshared()), 2); + mappingBack.set(allMappings.get(SectionUtil.SECTION_SUBJECT).getMapping(i) + numShared - 1, (i + 1 + (int) dictionary.getNshared()), 2); } } } @@ -249,7 +255,7 @@ private long createNoDataTypeSection(Map bitmaps Bitmap objectsBitMap = bitmaps.get(LiteralsUtils.NO_DATATYPE); Iterator objects = dictionary.getAllObjects().get(LiteralsUtils.NO_DATATYPE).getSortedEntries(); - DiffWrapper itSkipObjs = new DiffWrapper(objects, objectsBitMap,LiteralsUtils.NO_DATATYPE); + DiffWrapper itSkipObjs = new DiffWrapper(objects, objectsBitMap, LiteralsUtils.NO_DATATYPE); ArrayList> listSkipObjs = new ArrayList<>(); listSkipObjs.add(itSkipObjs); @@ -283,11 +289,11 @@ public SharedWrapper(int flag, Bitmap bitmapSub, Bitmap bitmapObj, Iterator IDs = new ArrayList<>(); - IDs.add(new CatElement.IteratorPlusPosition("shared",count+1)); + IDs.add(new CatElement.IteratorPlusPosition(SectionUtil.SECTION_SHARED,count+1)); next = new CatElement(element,IDs); count++; return true; @@ -312,7 +318,8 @@ public int count() { return i; } } - public Map getAllMappings() { + @Override + public Map getAllMappings() { return allMappings; } diff --git 
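The mappingBack construction above leans on HDT's subject ID layout: shared subject/object strings occupy IDs 1..numShared, and plain subjects follow from numShared+1, which is why the type-2 branch adds numShared to the mapped position. In numbers (illustrative values, not from the patch):

    public class SubjectIdLayout {
        public static void main(String[] args) {
            long numShared = 1000; // size of the shared section in the new HDT

            // type 1: the old string landed in the new shared section;
            // shared IDs start at 1, so the mapping is used as-is
            long mappedShared = 42;
            System.out.println("new global ID: " + mappedShared); // 42

            // type 2: the old string landed in the new subject section,
            // which starts right after the shared entries
            long mappedSubject = 42;
            System.out.println("new global ID: " + (mappedSubject + numShared)); // 1042
        }
    }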
a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java index 293bb853..85cdc185 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java @@ -18,6 +18,7 @@ import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; import org.rdfhdt.hdt.util.listener.ListenerUtil; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CharSequenceComparator; @@ -53,7 +54,7 @@ public long getNAllObjects() { return objects.values().stream().mapToLong(DictionarySectionPrivate::getNumberOfElements).sum(); } - private ExceptionThread fillSection(Iterator objects, ProgressListener listener) throws InterruptedException { + private ExceptionThread fillSection(Iterator objects, ProgressListener listener) { PipedCopyIterator noDatatypeIterator = new PipedCopyIterator<>(); PipedCopyIterator datatypeIterator = new PipedCopyIterator<>(); String name = filename.getFileName().toString(); @@ -151,11 +152,11 @@ public void save(OutputStream output, ControlInfo ci, ProgressListener listener) VByte.encode(output, objects.size()); - for (Map.Entry entry : objects.entrySet()) { + for (Map.Entry entry : objects.entrySet()) { IOUtil.writeSizedBuffer(output, entry.getKey().toString().getBytes(ByteStringUtil.STRING_ENCODING), listener); } - for (Map.Entry entry : objects.entrySet()) { + for (Map.Entry entry : objects.entrySet()) { entry.getValue().save(output, iListener); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java index faaa4491..048b4ca5 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java @@ -40,6 +40,7 @@ import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.CharSequenceCustomComparator; @@ -52,12 +53,12 @@ public class HashDictionarySection implements TempDictionarySection { public static final int TYPE_INDEX = 1; - private HashMap map; - private List list; + private Map map; + private List list; private int size; public boolean sorted; final boolean isCustom; - private final Map literalsCounts = new HashMap<>(); + private final Map literalsCounts = new HashMap<>(); /** * */ @@ -87,7 +88,7 @@ public long locate(CharSequence s) { * @see hdt.dictionary.DictionarySection#extract(int) */ @Override - public CharSequence extract(long pos) { + public ByteString extract(long pos) { if(pos<=0) { return null; } @@ -128,7 +129,7 @@ public Iterator getEntries() { @Override public long add(CharSequence entry) { - CharSequence compact = new CompactString(entry); + ByteString compact = new CompactString(entry); return map.computeIfAbsent(compact, key -> { // Not found, insert new list.add(compact); @@ -137,7 +138,7 @@ public long add(CharSequence entry) { // custom for 
subsection literals .. if (isCustom) { - CharSequence type = LiteralsUtils.getType(compact); + ByteString type = ByteString.of(LiteralsUtils.getType(compact)); // check if the entry doesn't already exist literalsCounts.compute(type, (key2, count) -> count == null ? 1L : count + 1L); } @@ -147,7 +148,7 @@ public long add(CharSequence entry) { @Override public void remove(CharSequence seq) { - map.remove(seq); + map.remove(ByteString.of(seq)); sorted = false; } @@ -192,7 +193,7 @@ public void close() throws IOException { } @Override - public Map getLiteralsCounts() { + public Map getLiteralsCounts() { return literalsCounts; } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java index f28e97f0..b5db6936 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java @@ -2,6 +2,7 @@ import org.rdfhdt.hdt.dictionary.TempDictionarySection; import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.util.string.ByteString; import java.io.IOException; import java.util.Iterator; @@ -23,11 +24,6 @@ public OneReadDictionarySection(Iterator reader, long si this.size = size; } - @Override - public Map getLiteralsCounts() { - return TempDictionarySection.super.getLiteralsCounts(); - } - @Override public long add(CharSequence str) { throw new NotImplementedException(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java index ef4e88fa..71f8969b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java @@ -48,6 +48,7 @@ import org.rdfhdt.hdt.util.crc.CRCInputStream; import org.rdfhdt.hdt.util.crc.CRCOutputStream; import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.ReplazableString; @@ -106,7 +107,7 @@ public void load(Iterator it, long numentries, ProgressL try { while(it.hasNext()) { - CharSequence str = it.next(); + ByteString str = (ByteString) it.next(); if(numstrings%blocksize==0) { // Add new block pointer @@ -144,7 +145,7 @@ public void load(Iterator it, long numentries, ProgressL } } - protected int locateBlock(CharSequence str) { + protected int locateBlock(ByteString str) { if(blocks.getNumberOfElements()==0) { return -1; } @@ -185,20 +186,23 @@ public long locate(CharSequence str) { if(text==null || blocks==null) { return 0; } + + // convert into bytestring to avoid bad comparison + ByteString bstr = ByteString.of(str); - int blocknum = locateBlock(str); + int blocknum = locateBlock(bstr); if(blocknum>=0) { // Located exactly - return (blocknum*blocksize)+1; + return ((long) blocknum * blocksize) + 1; } else { // Not located exactly. 
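The locate logic in these hunks follows the java.util.Arrays.binarySearch convention: a non-negative result from locateBlock is an exact hit on a block head, a negative result encodes the insertion point, and the "-blocknum-2" recovery in the surrounding code lands on the block that must contain the string. A small self-contained demonstration of the same convention:

    import java.util.Arrays;

    public class BlockLocate {
        public static void main(String[] args) {
            // first string of each block, sorted (illustrative data)
            String[] blockHeads = {"alice", "dave", "peter"};

            int pos = Arrays.binarySearch(blockHeads, "bob");
            // "bob" is not a block head: pos = -(insertionPoint) - 1 = -2
            System.out.println(pos);

            int block = -pos - 2; // = 0: "bob" can only be in the "alice" block
            System.out.println(block);
        }
    }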
blocknum = -blocknum-2; if(blocknum>=0) { - int idblock = locateInBlock(blocknum, str); + int idblock = locateInBlock(blocknum, bstr); if(idblock != 0) { - return (blocknum*blocksize)+idblock+1; + return ((long) blocknum * blocksize) + idblock + 1; } } } @@ -206,7 +210,7 @@ public long locate(CharSequence str) { return 0; } - public int locateInBlock(int block, CharSequence str) { + public int locateInBlock(int block, ByteString str) { if(block>=blocks.getNumberOfElements()) { return 0; } @@ -354,7 +358,7 @@ public long getNumberOfElements() { */ @Override public Iterator getSortedEntries() { - return new Iterator() { + return new Iterator<>() { int id; int pos; final Mutable delta = new Mutable<>(0L); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBig.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBig.java index 0cc299eb..63f693a0 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBig.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBig.java @@ -52,6 +52,7 @@ import org.rdfhdt.hdt.util.crc.CRCInputStream; import org.rdfhdt.hdt.util.crc.CRCOutputStream; import org.rdfhdt.hdt.util.io.BigByteBuffer; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.ReplazableString; @@ -120,11 +121,11 @@ public void load(Iterator it, long numentries, ProgressL long byteoutsize = 0; ByteArrayOutputStream byteOut = new ByteArrayOutputStream(16*1024); - CharSequence previousStr=null; + ByteString previousStr=null; try { while(it.hasNext()) { - CharSequence str = it.next(); + ByteString str = ByteString.of(it.next()); if(numstrings%blocksize==0) { // Add new block pointer @@ -227,7 +228,7 @@ public void load(Iterator it, long numentries, ProgressL /** * Locate the block of a string doing binary search. 
*/ - protected long locateBlock(CharSequence str) { + protected long locateBlock(ByteString str) { long low = 0; long high = blocks.getNumberOfElements() - 1; long max = high; @@ -259,8 +260,8 @@ protected long locateBlock(CharSequence str) { */ @Override public long locate(CharSequence str) { - - long blocknum = locateBlock(str); + ByteString bstr = ByteString.of(str); + long blocknum = locateBlock(bstr); if(blocknum>=0) { // Located exactly return (blocknum*blocksize)+1; @@ -269,7 +270,7 @@ public long locate(CharSequence str) { blocknum = -blocknum-2; if(blocknum>=0) { - long idblock = locateInBlock(blocknum, str); + long idblock = locateInBlock(blocknum, bstr); if(idblock != 0) { return (blocknum*blocksize)+idblock+1; @@ -281,7 +282,7 @@ public long locate(CharSequence str) { return 0; } - protected long locateInBlock(long blocknum, CharSequence str) { + protected long locateInBlock(long blocknum, ByteString str) { ReplazableString tempString = new ReplazableString(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionMap.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionMap.java index 59a6d4c1..493b8b42 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionMap.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionMap.java @@ -27,21 +27,6 @@ package org.rdfhdt.hdt.dictionary.impl.section; -import java.io.BufferedInputStream; -import java.io.Closeable; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.nio.channels.FileChannel.MapMode; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; - import org.rdfhdt.hdt.compact.integer.VByte; import org.rdfhdt.hdt.compact.sequence.Sequence; import org.rdfhdt.hdt.compact.sequence.SequenceFactory; @@ -56,12 +41,27 @@ import org.rdfhdt.hdt.util.io.BigMappedByteBuffer; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.ReplazableString; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedInputStream; +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.Iterator; + /** * @author mario.arias * @author Dennis Diefenbach @@ -139,7 +139,7 @@ public PFCDictionarySectionMap(CountInputStream input, File f) throws IOExceptio } } - private long locateBlock(CharSequence str) { + private long locateBlock(ByteString str) { if(blocks.getNumberOfElements()==0) { return -1; } @@ -175,11 +175,12 @@ private long locateBlock(CharSequence str) { */ @Override public long locate(CharSequence str) { + ByteString bstr = ByteString.of(str); if(buffers==null || blocks==null) { return 0; } - long blocknum = locateBlock(str); + long blocknum = locateBlock(bstr); if(blocknum>=0) { // Located exactly return 
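The (long) casts added to PFCDictionarySection a few hunks above fix a genuine hazard: there, blocknum and blocksize are both ints, so the product is computed in 32-bit arithmetic and can wrap before being widened, corrupting IDs in very large sections. (In the Big variant below, blocknum is already a long, so the corresponding lines stay unchanged.) A minimal demonstration:

    public class OverflowDemo {
        public static void main(String[] args) {
            int blocknum = 70_000_000; // plausible for a huge dictionary section
            int blocksize = 32;

            long wrong = blocknum * blocksize;        // int math first: overflows
            long right = (long) blocknum * blocksize; // widened before multiplying

            System.out.println(wrong); // -2054967296
            System.out.println(right); // 2240000000
        }
    }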
(blocknum*blocksize)+1; @@ -188,7 +189,7 @@ public long locate(CharSequence str) { blocknum = -blocknum-2; if(blocknum>=0) { - long idblock = locateInBlock(blocknum, str); + long idblock = locateInBlock(blocknum, bstr); if(idblock != 0) { return (blocknum*blocksize)+idblock+1; @@ -199,7 +200,7 @@ public long locate(CharSequence str) { return 0; } - protected long locateInBlock(long block, CharSequence str) { + protected long locateInBlock(long block, ByteString str) { if(block>=blocks.getNumberOfElements()) { return 0; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatElement.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatElement.java index 9dd73bd5..baf528af 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatElement.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatElement.java @@ -1,25 +1,24 @@ package org.rdfhdt.hdt.dictionary.impl.utilCat; +import org.rdfhdt.hdt.util.string.ByteString; + import java.util.ArrayList; +import java.util.List; public class CatElement { - public CharSequence entity; - public ArrayList IDs; - public CatElement(CharSequence entity, ArrayList IDs){ + public ByteString entity; + public List IDs; + public CatElement(ByteString entity, List IDs) { this.entity = entity; this.IDs = new ArrayList<>(IDs); } - public static class IteratorPlusPosition{ - public CharSequence iter; + public static class IteratorPlusPosition { + public ByteString iter; public long pos; - public IteratorPlusPosition(CharSequence iter,long pos){ + public IteratorPlusPosition(ByteString iter, long pos){ this.iter = iter; this.pos = pos; } } - @Override - public boolean equals(Object o) { - return entity.toString().equals(((CatElement) o).entity.toString()); - } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatIntersection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatIntersection.java index 501b81eb..93378e08 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatIntersection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatIntersection.java @@ -20,10 +20,8 @@ package org.rdfhdt.hdt.dictionary.impl.utilCat; import org.rdfhdt.hdt.exceptions.NotImplementedException; -import org.rdfhdt.hdt.util.string.CompactString; import java.util.ArrayList; -import java.util.Collections; import java.util.Iterator; public class CatIntersection implements Iterator { @@ -62,11 +60,11 @@ public CatElement next() { private void helpNext(){ while (list.size() != 0) { - list.sort(new IteratorPlusElementComparator()); + list.sort(IteratorPlusElement::compareTo); if (list.size() == 2) { - if (new CompactString(list.get(0).element.entity).equals(new CompactString(list.get(1).element.entity))) { + if (list.get(0).element.entity.equals(list.get(1).element.entity)) { hasNext = true; ArrayList ids = new ArrayList<>(); ids.addAll(list.get(0).element.IDs); @@ -113,4 +111,4 @@ private void helpNext(){ public void remove() { throw new NotImplementedException(); } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatMapping.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatMapping.java index 52bc771a..df09fe7a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatMapping.java +++ 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatMapping.java @@ -22,6 +22,7 @@ import org.rdfhdt.hdt.util.disk.LongArrayDisk; import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.string.ByteString; import java.io.Closeable; import java.io.IOException; @@ -31,7 +32,7 @@ public class CatMapping implements Closeable { private final LongArrayDisk mappingType; private final long size; - public CatMapping(String location, String section, long size){ + public CatMapping(String location, ByteString section, long size){ this.size = size; this.mapping = new LongArrayDisk(location+section,size); this.mappingType = new LongArrayDisk(location+section+"Types",size); @@ -57,4 +58,4 @@ public long getSize(){ public void close() throws IOException { IOUtil.closeAll(mapping, mappingType); } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatUnion.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatUnion.java index e11ebc63..f309c69b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatUnion.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatUnion.java @@ -1,6 +1,7 @@ package org.rdfhdt.hdt.dictionary.impl.utilCat; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CompactString; import java.util.ArrayList; @@ -13,7 +14,7 @@ * */ public class CatUnion implements Iterator { - ArrayList list; + List list; private final List> listIters; @@ -39,14 +40,13 @@ public boolean hasNext() { @Override public CatElement next() { - ArrayList ids = new ArrayList<>(); - list.sort(new IteratorPlusElementComparator()); - CharSequence element = list.get(0).element.entity; - CompactString elementCompactString = new CompactString(element); + List ids = new ArrayList<>(); + list.sort(IteratorPlusElement::compareTo); + ByteString element = list.get(0).element.entity; ListIterator iteratorPlusElementIterator = list.listIterator(); while (iteratorPlusElementIterator.hasNext()) { IteratorPlusElement next = iteratorPlusElementIterator.next(); - if(elementCompactString.equals(new CompactString(next.element.entity))) { + if (element.equals(next.element.entity)) { int iter = next.iter; ids.addAll(next.element.IDs); if (listIters.get(iter).hasNext()) { @@ -55,10 +55,10 @@ public CatElement next() { } else { iteratorPlusElementIterator.remove(); } - } else{ + } else { break; } } - return new CatElement(element,ids); + return new CatElement(element, ids); } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatWrapper.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatWrapper.java index 0405f0e0..a6db3db2 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatWrapper.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatWrapper.java @@ -1,13 +1,16 @@ package org.rdfhdt.hdt.dictionary.impl.utilCat; +import org.rdfhdt.hdt.util.string.ByteString; + import java.util.ArrayList; import java.util.Iterator; +import java.util.List; public class CatWrapper implements Iterator { public Iterator sectionIter; - public String iterName; + public ByteString iterName; int count = 0; - public CatWrapper(Iterator sectionIter,String iterName){ + public CatWrapper(Iterator sectionIter, ByteString iterName){ this.sectionIter = sectionIter; this.iterName = iterName; } @@ -19,9 +22,9 @@ public boolean 
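The rewritten CatUnion.next() above merges an arbitrary number of sorted iterators and groups equal heads, so each distinct string is emitted exactly once together with all of its source positions. The same logic expressed with a PriorityQueue over plain strings (illustrative, without the CatElement/ID bookkeeping):

    import java.util.Iterator;
    import java.util.List;
    import java.util.PriorityQueue;

    public class KWayUnion {
        static final class Head {
            final String value;
            final Iterator<String> source;
            Head(String value, Iterator<String> source) {
                this.value = value;
                this.source = source;
            }
        }

        public static void main(String[] args) {
            List<Iterator<String>> sources = List.of(
                    List.of("a", "c", "d").iterator(),
                    List.of("b", "c").iterator());

            PriorityQueue<Head> queue =
                    new PriorityQueue<>((x, y) -> x.value.compareTo(y.value));
            for (Iterator<String> it : sources) {
                if (it.hasNext()) {
                    queue.add(new Head(it.next(), it));
                }
            }
            while (!queue.isEmpty()) {
                String value = queue.peek().value;
                // drain every head equal to the minimum: union, not concatenation
                while (!queue.isEmpty() && queue.peek().value.equals(value)) {
                    Head head = queue.poll();
                    if (head.source.hasNext()) {
                        queue.add(new Head(head.source.next(), head.source));
                    }
                }
                System.out.println(value); // a, b, c, d
            }
        }
    }

The patch's version keeps a sorted list instead of a heap and now compares ByteStrings directly, which removes the per-comparison CompactString allocations of the old code.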
hasNext() {
 
 	@Override
 	public CatElement next() {
-		CharSequence entity = sectionIter.next();
+		ByteString entity = ByteString.of(sectionIter.next());
 		count++;
-		ArrayList<CatElement.IteratorPlusPosition> IDs = new ArrayList<>();
+		List<CatElement.IteratorPlusPosition> IDs = new ArrayList<>();
 		IDs.add(new CatElement.IteratorPlusPosition(iterName,count));
 
 		return new CatElement(entity,IDs);
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/IteratorPlusElement.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/IteratorPlusElement.java
index 8992da7a..716e1b29 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/IteratorPlusElement.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/IteratorPlusElement.java
@@ -1,10 +1,15 @@
 package org.rdfhdt.hdt.dictionary.impl.utilCat;
 
-public class IteratorPlusElement {
+public class IteratorPlusElement implements Comparable<IteratorPlusElement> {
 	int iter;
 	CatElement element;
 	IteratorPlusElement(int iter, CatElement element){
 		this.iter = iter;
 		this.element = element;
 	}
-}
\ No newline at end of file
+
+	@Override
+	public int compareTo(IteratorPlusElement o) {
+		return element.entity.compareTo(o.element.entity);
+	}
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/IteratorPlusElementComparator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/IteratorPlusElementComparator.java
deleted file mode 100644
index 0cff1fc0..00000000
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/IteratorPlusElementComparator.java
+++ /dev/null
@@ -1,15 +0,0 @@
-package org.rdfhdt.hdt.dictionary.impl.utilCat;
-
-import org.rdfhdt.hdt.util.string.CharSequenceComparator;
-import org.rdfhdt.hdt.util.string.CompactString;
-
-import java.util.Comparator;
-
-public class IteratorPlusElementComparator implements Comparator<IteratorPlusElement> {
-
-	public int compare(IteratorPlusElement a, IteratorPlusElement b) {
-		CharSequenceComparator comparator = new CharSequenceComparator();
-
-		return comparator.compare(new CompactString(a.element.entity),new CompactString(b.element.entity));
-	}
-}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java
index e243cd89..8490a6a4 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java
@@ -8,6 +8,7 @@
 import org.rdfhdt.hdt.util.crc.CRCOutputStream;
 import org.rdfhdt.hdt.util.io.IOUtil;
 import org.rdfhdt.hdt.util.listener.ListenerUtil;
+import org.rdfhdt.hdt.util.string.ByteString;
 import org.rdfhdt.hdt.util.string.ByteStringUtil;
 
 import java.io.ByteArrayOutputStream;
@@ -20,21 +21,54 @@
 public class SectionUtil {
 
+	public static final ByteString S1 = ByteString.of("S1");
+	public static final ByteString S2 = ByteString.of("S2");
+	public static final ByteString P1 = ByteString.of("P1");
+	public static final ByteString P2 = ByteString.of("P2");
+	public static final ByteString O1 = ByteString.of("O1");
+	public static final ByteString O2 = ByteString.of("O2");
+	public static final ByteString SH1 = ByteString.of("SH1");
+	public static final ByteString SH2 = ByteString.of("SH2");
+
+	public static final ByteString SECTION = ByteString.of("section");
+	public static final ByteString SECTION_SUBJECT = ByteString.of("subject");
+	public static final ByteString SECTION_PREDICATE = ByteString.of("predicate");
+	public
static final ByteString SECTION_OBJECT = ByteString.of("object");
+	public static final ByteString SECTION_SHARED = ByteString.of("shared");
+	public static final ByteString BACK = ByteString.of("back");
+	private static final ByteString SUB_PREFIX = ByteString.of("sub");
+
 	private static final int DEFAULT_BLOCK_SIZE = 16;
 	private static final int BLOCK_PER_BUFFER = 1000000;
 
+	public static ByteString createSub(Object next) {
+		return createSub(String.valueOf(next));
+	}
+
+	public static ByteString createSub(CharSequence next) {
+		return createSub(ByteString.of(next));
+	}
+
+	public static ByteString createSub(ByteString next) {
+		return SUB_PREFIX.copyAppend(next);
+	}
+
 	public static void createSection(String location, long numEntries, int type, CatUnion itAdd , CatUnion itSkip , Map<ByteString, CatMapping> mappings, long offset, ProgressListener listener) throws IOException {
-		String name = "";
+		ByteString name;
 		switch (type) {
 			case 2:
-				name = "subject";
+				name = SECTION_SUBJECT;
 				break;
 			case 3:
-				name = "object";
+				name = SECTION_OBJECT;
 				break;
 			case 4:
-				name = "predicate";
+				name = SECTION_PREDICATE;
+				break;
+			default:
+				name = ByteString.empty();
+				break;
 		}
 		long storedBuffersSize = 0;
 		long numBlocks = 0;
@@ -45,7 +79,7 @@ public static void createSection(String location, long numEntries, int type, Cat
 		blocks = new SequenceLog64BigDisk(location + "SequenceLog64BigDisk" + type, 64, numEntries / 16);
 		byteOut = new ByteArrayOutputStream(16 * 1024);
 		if (numEntries > 0) {
-			CharSequence previousStr = null;
+			ByteString previousStr = null;
 			CatElement skipElement = null;
 			if (itSkip.hasNext()) {
@@ -55,7 +89,7 @@ public static void createSection(String location, long numEntries, int type, Cat
 			ListenerUtil.notifyCond(listener, "Analyze section " + name + " ", numberElements, numberElements, numEntries);
 			CatElement nextElement = itAdd.next();
-			if (skipElement != null && nextElement.entity.toString().equals(skipElement.entity.toString())) {
+			if (skipElement != null && nextElement.entity.equals(skipElement.entity)) {
 				if (itSkip.hasNext()) {
 					skipElement = itSkip.next();
 				} else {
@@ -64,13 +98,13 @@ public static void createSection(String location, long numEntries, int type, Cat
 			} else {
 				for (int i = 0; i < nextElement.IDs.size(); i++) {
 					long id = nextElement.IDs.get(i).pos;
-					String iter = nextElement.IDs.get(i).iter.toString();
-					if (iter.equals("shared"))
+					ByteString iter = nextElement.IDs.get(i).iter;
+					if (iter.equals(SECTION_SHARED))
 						mappings.get(iter).set(id - 1, offset + numberElements + 1, type);
 					else
 						mappings.get(iter).set(id - 1, numberElements + 1, type);
 				}
-				String str = nextElement.entity.toString();
+				ByteString str = nextElement.entity;
 				if (numberElements % DEFAULT_BLOCK_SIZE == 0) {
 					blocks.append(storedBuffersSize + byteOut.size());
 					numBlocks++;
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilDiff/DiffWrapper.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilDiff/DiffWrapper.java
index be58746e..0e311ecb 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilDiff/DiffWrapper.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilDiff/DiffWrapper.java
@@ -2,9 +2,11 @@
 import org.rdfhdt.hdt.compact.bitmap.Bitmap;
 import org.rdfhdt.hdt.dictionary.impl.utilCat.CatElement;
+import org.rdfhdt.hdt.util.string.ByteString;
 
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 
 /**
  * Iterator keeping only the element marked with a true in a bitmap
@@ -13,7 +15,7 @@
 public class DiffWrapper
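With the createSub helpers above, every per-section CatMapping gets a stable ByteString key instead of ad-hoc string concatenation: createSub(n) builds "sub" + n, untyped objects keep the NO_DATATYPE name, and the cat code appends "1" or "2" for the source dictionary via copyAppend. A runnable sketch of the key scheme with String stand-ins (counter handling simplified, names illustrative):

    public class SubsectionKeys {
        public static void main(String[] args) {
            final String NO_DATATYPE = "NO_DATATYPE"; // stand-in for LiteralsUtils.NO_DATATYPE
            String[] dataTypes = {NO_DATATYPE, "xsd:int", "xsd:string"}; // sorted keys
            int countSubSections = 0;
            for (String dataType : dataTypes) {
                String prefix;
                if (dataType.equals(NO_DATATYPE)) {
                    prefix = NO_DATATYPE;              // untyped objects keep their own name
                } else {
                    prefix = "sub" + countSubSections; // SectionUtil.createSub(countSubSections)
                }
                countSubSections++;
                // the cat code appends the dictionary index, e.g. copyAppend("1")
                System.out.println(dataType + " -> " + prefix + "1");
            }
            // prints: NO_DATATYPE -> NO_DATATYPE1, xsd:int -> sub11, xsd:string -> sub21
        }
    }

Interning the fixed keys as constants (S1, SH2, SECTION_SHARED, ...) also means the hot mapping lookups no longer rebuild the same strings on every element.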
implements Iterator { public final Iterator sectionIter; public final Bitmap bitmap; - public final CharSequence iterName; + public final ByteString iterName; /** * create a diffWrapper of the iterator sectionIter with the bitmap bitmap @@ -22,7 +24,7 @@ public class DiffWrapper implements Iterator { * @param bitmap the bitmap to tell which element to keep * @param iterName the name of the section of the iterator */ - public DiffWrapper(Iterator sectionIter, Bitmap bitmap, CharSequence iterName) { + public DiffWrapper(Iterator sectionIter, Bitmap bitmap, ByteString iterName) { this.sectionIter = sectionIter; this.bitmap = bitmap; this.iterName = iterName; @@ -34,10 +36,10 @@ public DiffWrapper(Iterator sectionIter, Bitmap bitmap, @Override public boolean hasNext() { while (sectionIter.hasNext()) { - CharSequence element = sectionIter.next(); + ByteString element = ByteString.of(sectionIter.next()); if (bitmap.access(count)) { // we need to keep this element - ArrayList IDs = new ArrayList<>(); + List IDs = new ArrayList<>(); IDs.add(new CatElement.IteratorPlusPosition(iterName, count + 1)); next = new CatElement(element, IDs); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index 7b75763b..9e0631fb 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -411,9 +411,10 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< // generate the hdt gen++; profiler.pushSection("generateHDT #" + gen); - ProgressListener il = PrefixListener.of("gen#" + gen, listener); + PrefixListener il = PrefixListener.of("gen#" + gen, listener); Path hdtLocation = hdtStore.resolve("hdt-" + gen + ".hdt"); supplier.doGenerateHDT(it, baseURI, hdtFormat, il, hdtLocation); + il.clearThreads(); nextFile = it.hasNextFlux(); HDTFile hdtFile = new HDTFile(hdtLocation, 1); @@ -424,15 +425,16 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< HDTFile lastHDTFile = files.remove(files.size() - 1); cat++; profiler.pushSection("catHDT #" + cat); - ProgressListener ilc = PrefixListener.of("cat#" + cat, listener); + PrefixListener ilc = PrefixListener.of("cat#" + cat, listener); Path hdtCatFileLocation = hdtStore.resolve("hdtcat-" + cat + ".hdt"); try (HDT abcat = HDTManager.catHDT( hdtCatLocation, lastHDTFile.getHdtFile().toAbsolutePath().toString(), hdtFile.getHdtFile().toAbsolutePath().toString(), hdtFormat, ilc)) { - abcat.saveToHDT(hdtCatFileLocation.toAbsolutePath().toString(), il); + abcat.saveToHDT(hdtCatFileLocation.toAbsolutePath().toString(), ilc); } + ilc.clearThreads(); // delete previous chunks Files.delete(lastHDTFile.getHdtFile()); Files.delete(hdtFile.getHdtFile()); @@ -444,6 +446,8 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< files.add(hdtFile); } while (nextFile); + listener.notifyProgress(100, "done, loading HDT"); + Path hdtFile = files.get(0).hdtFile; assert files.get(0).getChunks() == gen; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java index c9f851ae..5626cb6d 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java @@ -18,6 +18,8 @@ import 
org.rdfhdt.hdt.util.io.compress.CompressNodeReader; import org.rdfhdt.hdt.util.io.compress.CompressUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.string.ByteString; +import org.rdfhdt.hdt.util.string.CompactString; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -70,8 +72,8 @@ public SectionCompressor(CloseSuppressPath baseFileName, AsyncIteratorFetcher iterator type + * @author Antoine Willerval + */ +public class CombinedIterator implements Iterator { + /** + * combine multiple iterators + * + * @param iterators iterators + * @param iterator type + * @return iterator + * @throws java.lang.NullPointerException if iterators is null + */ + public static Iterator combine(List> iterators) { + Objects.requireNonNull(iterators, "iterators can't be null"); + return combine(iterators, 0, iterators.size()); + } + + private static Iterator combine(List> iterators, int start, int end) { + int len = end - start; + + if (len <= 0) { + return new Iterator<>() { + @Override + public boolean hasNext() { + return false; + } + + @Override + public T next() { + return null; + } + }; + } + if (len == 1) { + return iterators.get(start); + } + + // use a tree to reduce the effect of big combine + int mid = (end + start) / 2; + return new CombinedIterator<>( + combine(iterators, start, mid), + combine(iterators, mid, end) + ); + } + + private final Iterator left; + private final Iterator right; + private T next; + + private CombinedIterator(Iterator left, Iterator right) { + this.left = left; + this.right = right; + } + + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + + if (left.hasNext()) { + next = left.next(); + } else if (right.hasNext()) { + next = right.next(); + } else { + return false; + } + return true; + } + + @Override + public T next() { + if (!hasNext()) { + return null; + } + try { + return next; + } finally { + next = null; + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java index 249a8e73..9b984f0b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java @@ -1,6 +1,7 @@ package org.rdfhdt.hdt.iterator.utils; import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import java.io.IOException; @@ -18,7 +19,7 @@ public class FileTripleIterator extends FileChunkIterator { public static long estimateSize(TripleString tripleString) { try { - return tripleString.asNtriple().toString().getBytes(ByteStringUtil.STRING_ENCODING).length; + return ByteString.of(tripleString.asNtriple()).getBuffer().length; } catch (IOException e) { throw new RuntimeException("Can't estimate the size of the triple " + tripleString, e); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java index dd372185..6913699e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java @@ -45,7 +45,7 @@ public boolean hasNext() throws E { public T next() throws E { current++; if (current % (size / split) == 0) { - 
listener.notifyProgress((float) current / size, message + " " + current + "/" + size); + listener.notifyProgress(100f * current / size, message + " " + current + "/" + size); } return it.next(); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java index 394ffcfe..a76c6d9d 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java @@ -1,23 +1,27 @@ package org.rdfhdt.hdt.triples; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import java.util.Comparator; -public class IndexedNode implements Comparable { - private static final Comparator NODE_COMPARATOR = CharSequenceComparator.getInstance(); - private CharSequence node; +public final class IndexedNode implements Comparable { + private ByteString node; private long index; - public IndexedNode(CharSequence node, long index) { + public IndexedNode(ByteString node, long index) { this.node = node; this.index = index; } + + public IndexedNode(CharSequence node, long index) { + this(ByteString.of(node), index); + } public IndexedNode() { } - public CharSequence getNode() { + public ByteString getNode() { return node; } @@ -29,12 +33,12 @@ public void setIndex(long index) { this.index = index; } - public void setNode(CharSequence node) { + public void setNode(ByteString node) { this.node = node; } @Override public int compareTo(IndexedNode o) { - return NODE_COMPARATOR.compare(node, o.getNode()); + return node.compareTo(o.node); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorCat.java index fe8758ac..ef2524f2 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorCat.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorCat.java @@ -20,6 +20,7 @@ package org.rdfhdt.hdt.triples.impl; import org.rdfhdt.hdt.dictionary.DictionaryCat; +import org.rdfhdt.hdt.dictionary.impl.utilCat.SectionUtil; import org.rdfhdt.hdt.enums.ResultEstimationType; import org.rdfhdt.hdt.enums.TripleComponentOrder; import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMapping; @@ -120,14 +121,11 @@ public boolean hasNext() { @Override public TripleID next() { - if (list.hasNext()){ - return list.next(); - } else { - + if (!list.hasNext()) { list = getTripleID(count).listIterator(); - count ++; - return list.next(); + count++; } + return list.next(); } @Override @@ -137,10 +135,8 @@ public void remove() { private List getTripleID(int count){ Set set = new HashSet<>(); - ArrayList mapping = null; - ArrayList mappingType = null; - mapping = dictionaryCat.getMappingS().getMapping(count); - mappingType = dictionaryCat.getMappingS().getType(count); + List mapping = dictionaryCat.getMappingS().getMapping(count); + List mappingType = dictionaryCat.getMappingS().getType(count); for (int i = 0; i getTripleID(int count){ } } } - ArrayList triples = new ArrayList(set); - Collections.sort(triples, tripleIDComparator); + ArrayList triples = new ArrayList<>(set); + triples.sort(tripleIDComparator); return triples; } public TripleID mapTriple(TripleID tripleID, int num){ if (num == 1){ - long new_subject1 = mapIdSection(tripleID.getSubject(), dictionaryCat.getAllMappings().get("SH1"),dictionaryCat.getAllMappings().get("S1")); - long new_predicate1 = 
mapIdPredicate(tripleID.getPredicate(), dictionaryCat.getAllMappings().get("P1")); - long new_object1 = mapIdSection(tripleID.getObject(), dictionaryCat.getAllMappings().get("SH1"),dictionaryCat.getAllMappings().get("O1")); + long new_subject1 = mapIdSection(tripleID.getSubject(), dictionaryCat.getAllMappings().get(SectionUtil.SH1),dictionaryCat.getAllMappings().get(SectionUtil.S1)); + long new_predicate1 = mapIdPredicate(tripleID.getPredicate(), dictionaryCat.getAllMappings().get(SectionUtil.P1)); + long new_object1 = mapIdSection(tripleID.getObject(), dictionaryCat.getAllMappings().get(SectionUtil.SH1),dictionaryCat.getAllMappings().get(SectionUtil.O1)); return new TripleID(new_subject1, new_predicate1, new_object1); } else { - long new_subject2 = mapIdSection(tripleID.getSubject(), dictionaryCat.getAllMappings().get("SH2"),dictionaryCat.getAllMappings().get("S2")); - long new_predicate2 = mapIdPredicate(tripleID.getPredicate(), dictionaryCat.getAllMappings().get("P2")); - long new_object2 = mapIdSection(tripleID.getObject(), dictionaryCat.getAllMappings().get("SH2"),dictionaryCat.getAllMappings().get("O2")); + long new_subject2 = mapIdSection(tripleID.getSubject(), dictionaryCat.getAllMappings().get(SectionUtil.SH2),dictionaryCat.getAllMappings().get(SectionUtil.S2)); + long new_predicate2 = mapIdPredicate(tripleID.getPredicate(), dictionaryCat.getAllMappings().get(SectionUtil.P2)); + long new_object2 = mapIdSection(tripleID.getObject(), dictionaryCat.getAllMappings().get(SectionUtil.SH2),dictionaryCat.getAllMappings().get(SectionUtil.O2)); return new TripleID(new_subject2, new_predicate2, new_object2); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorMapDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorMapDiff.java index 7a86d2f4..3b953e58 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorMapDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriplesIteratorMapDiff.java @@ -2,9 +2,9 @@ import org.rdfhdt.hdt.compact.bitmap.Bitmap; -import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; import org.rdfhdt.hdt.dictionary.DictionaryDiff; import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMapping; +import org.rdfhdt.hdt.dictionary.impl.utilCat.SectionUtil; import org.rdfhdt.hdt.enums.ResultEstimationType; import org.rdfhdt.hdt.enums.TripleComponentOrder; import org.rdfhdt.hdt.exceptions.NotImplementedException; @@ -14,7 +14,9 @@ import org.rdfhdt.hdt.triples.TripleIDComparator; import org.rdfhdt.hdt.triples.Triples; -import java.util.*; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; public class BitmapTriplesIteratorMapDiff implements IteratorTripleID { @@ -33,10 +35,10 @@ public class BitmapTriplesIteratorMapDiff implements IteratorTripleID { Bitmap bitArrayDisk; public BitmapTriplesIteratorMapDiff(HDT hdtOriginal, Bitmap deleteBitmap, DictionaryDiff dictionaryDiff) { - this.subjMapping = dictionaryDiff.getAllMappings().get("subject"); - this.objMapping = dictionaryDiff.getAllMappings().get("object"); - this.predMapping = dictionaryDiff.getAllMappings().get("predicate"); - this.sharedMapping = dictionaryDiff.getAllMappings().get("shared"); + this.subjMapping = dictionaryDiff.getAllMappings().get(SectionUtil.SECTION_SUBJECT); + this.objMapping = dictionaryDiff.getAllMappings().get(SectionUtil.SECTION_OBJECT); + this.predMapping = dictionaryDiff.getAllMappings().get(SectionUtil.SECTION_PREDICATE); + 
this.sharedMapping = dictionaryDiff.getAllMappings().get(SectionUtil.SECTION_SHARED); this.dictionaryDiff = dictionaryDiff; this.countTriples = Math.max(0, hdtOriginal.getTriples().getNumberOfElements() - deleteBitmap.countOnes()); this.triples = hdtOriginal.getTriples(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java index b6decac2..abe9e157 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java @@ -1,19 +1,21 @@ package org.rdfhdt.hdt.util; +import org.rdfhdt.hdt.util.string.ByteString; + import java.util.Iterator; import java.util.Map; public class CustomIterator implements Iterator { - public CharSequence prev = ""; + public ByteString prev = ByteString.empty(); boolean first = true; Iterator iter; - Map literalsCounts; + Map literalsCounts; private long currCount; - public CustomIterator(Iterator iter, Map literalsCounts) { + public CustomIterator(Iterator iter, Map literalsCounts) { this.iter = iter; this.literalsCounts = literalsCounts; if(iter.hasNext()) { - prev = iter.next(); + prev = ByteString.of(iter.next()); currCount = literalsCounts.get(LiteralsUtils.getType(prev)); currCount--; } else { @@ -27,7 +29,7 @@ public boolean hasNext() { if(first) return true; if(iter.hasNext()){ - prev = iter.next(); + prev = ByteString.of(iter.next()); currCount = literalsCounts.get(LiteralsUtils.getType(prev)); currCount--; first = true; @@ -43,7 +45,7 @@ public CharSequence next() { if(first) { first = false; } else { - prev = iter.next(); + prev = ByteString.of(iter.next()); currCount--; } return LiteralsUtils.removeType(prev); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java index 4353131d..30f61fb0 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java @@ -1,6 +1,7 @@ package org.rdfhdt.hdt.util; import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.DelayedString; @@ -14,8 +15,8 @@ public class LiteralsUtils { /** * no datatype type */ - public static final CharSequence NO_DATATYPE = new CompactString(NO_DATATYPE_STR); - public static final CharSequence LITERAL_LANG_TYPE = new CompactString(LITERAL_LANG_TYPE_STR); + public static final ByteString NO_DATATYPE = new CompactString(NO_DATATYPE_STR); + public static final ByteString LITERAL_LANG_TYPE = new CompactString(LITERAL_LANG_TYPE_STR); /** * test if the node is a literal and contains a language @@ -151,7 +152,7 @@ public static CharSequence litToPref(CharSequence str) { if (containsLanguage(str)) { ReplazableString prefixedValue = new ReplazableString(2 + LITERAL_LANG_TYPE.length() + str.length()); prefixedValue.append(new byte[]{'^', '^'}, 0, 2); - prefixedValue.append(((CompactString) LITERAL_LANG_TYPE).getData()); + prefixedValue.append(LITERAL_LANG_TYPE.getBuffer(), 0, LITERAL_LANG_TYPE.length()); prefixedValue.appendNoCompact(str); return prefixedValue; } @@ -245,8 +246,7 @@ public static CharSequence prefToLit(CharSequence str) { * @param s1 string * @return embed version of s1 */ - public static CharSequence embed(CharSequence s1) { - s1 = 
DelayedString.unwrap(s1); + public static ByteString embed(ByteString s1) { if (s1 == null || s1.length() == 0) { return EmbeddedURI.EMPTY; } @@ -256,12 +256,14 @@ public static CharSequence embed(CharSequence s1) { return new EmbeddedURI(s1); } - private static class EmbeddedURI implements CharSequence { - private static final CharSequence EMPTY = new CompactString("<>"); + private static class EmbeddedURI implements ByteString { + private static final ByteString START = new CompactString("<"); + private static final ByteString END = new CompactString(">"); + private static final ByteString EMPTY = new CompactString("<>"); private int hash; - private final CharSequence parent; + private final ByteString parent; - public EmbeddedURI(CharSequence parent) { + public EmbeddedURI(ByteString parent) { this.parent = parent; } @@ -282,12 +284,25 @@ public char charAt(int index) { } @Override - public CharSequence subSequence(int start, int end) { + public byte[] getBuffer() { + byte[] buffer = new byte[START.length() + parent.length() + END.length()]; + System.arraycopy(START.getBuffer(), 0, buffer, 0, START.length()); + System.arraycopy(parent.getBuffer(), 0, buffer, START.length(), parent.length()); + System.arraycopy(END.getBuffer(), 0, buffer, START.length() + parent.length(), END.length()); + return buffer; + } + + @Override + public ByteString subSequence(int start, int end) { if (start == 0 && end == length()) { return this; } - if (start == 0 || end == length()) { - return new CompactString(this.toString().subSequence(start, end)); + + if (start == 0) { + return START.copyAppend(parent.subSequence(0, end - 1)); + } + if (end == length()) { + return parent.subSequence(start - 1, parent.length()).copyAppend(END); } return parent.subSequence(start - 1, end - 1); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java index ebdcc534..01c24095 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java @@ -5,6 +5,7 @@ import org.rdfhdt.hdt.util.crc.CRC32; import org.rdfhdt.hdt.util.crc.CRC8; import org.rdfhdt.hdt.util.crc.CRCOutputStream; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.ReplazableString; @@ -30,15 +31,9 @@ public CompressNodeWriter(OutputStream stream, long size) throws IOException { } public void appendNode(IndexedNode node) throws IOException { - CharSequence str = node.getNode(); + ByteString str = node.getNode(); long index = node.getIndex(); - // to avoid bad longestCommonPrefix call - // cf: https://github.com/rdfhdt/hdt-java/issues/165 - if (str instanceof String) { - str = new CompactString(str); - } - // Find common part. 
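	// [Editor's note, not part of the patch] The instanceof-String guard removed
	// above (added for https://github.com/rdfhdt/hdt-java/issues/165) is no longer
	// needed: IndexedNode.getNode() is now typed as ByteString, so the
	// longestCommonPrefix call below always receives byte-backed sequences and the
	// bad-prefix case described in that issue can no longer occur.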
int delta = ByteStringUtil.longestCommonPrefix(previousStr, str); // Write Delta in VByte diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java index 369dc8fa..7730c2d2 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java @@ -4,7 +4,6 @@ import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.triples.IndexedNode; import org.rdfhdt.hdt.util.string.ByteString; -import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.ReplazableString; import java.io.IOException; @@ -184,7 +183,7 @@ public boolean hasNext() { } while (it.hasNext()) { IndexedNode node = it.next(); - ByteString next = (ByteString) node.getNode(); + ByteString next = node.getNode(); int cmp = prev.compareTo(next); assert cmp <= 0: "bad order : " + prev + " > " + next; if (cmp == 0) { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixListener.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixListener.java index af452c38..1f253156 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixListener.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixListener.java @@ -1,5 +1,6 @@ package org.rdfhdt.hdt.util.listener; +import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.listener.ProgressListener; /** @@ -7,36 +8,83 @@ * * @author Antoine Willerval */ -public class PrefixListener implements ProgressListener { +public abstract class PrefixListener implements ProgressListener { + private static class SingleThreadPrefixListener extends PrefixListener { + private final ProgressListener listener; + + private SingleThreadPrefixListener(String prefix, ProgressListener listener) { + super(prefix); + this.listener = listener; + } + + @Override + public void clearThreads() { + // do nothing + } + + @Override + public void notifyProgress(float level, String message) { + listener.notifyProgress(level, prefix + message); + } + } + private static class MultiThreadPrefixListener extends PrefixListener implements MultiThreadListener { + private final MultiThreadListener listener; + + private MultiThreadPrefixListener(String prefix, MultiThreadListener listener) { + super(prefix); + this.listener = listener; + } + + + @Override + public void notifyProgress(String thread, float level, String message) { + listener.notifyProgress(thread, level, prefix + message); + } + + @Override + public void unregisterAllThreads() { + listener.unregisterAllThreads(); + } + + @Override + public void clearThreads() { + unregisterAllThreads(); + } + + @Override + public void registerThread(String threadName) { + listener.registerThread(threadName); + } + + @Override + public void unregisterThread(String threadName) { + listener.unregisterThread(threadName); + } + } /** - * create a prefix listener from another listener + * create a prefix listener from another listener, allow multi-thread listener * * @param prefix prefix to concat to the messages * @param listener the listener * @return null if listener is null, listener if prefix is null or empty or a prefix listener */ - public static ProgressListener of(String prefix, ProgressListener listener) { + public static PrefixListener of(String prefix, ProgressListener listener) { if (listener == null) { return null; } - if (prefix == 
null || prefix.isEmpty()) { - return listener; + if (listener instanceof MultiThreadListener) { + return new MultiThreadPrefixListener(prefix, (MultiThreadListener) listener); + } else { + return new SingleThreadPrefixListener(prefix, listener); } - - return new PrefixListener(prefix, listener); } - private final String prefix; - private final ProgressListener listener; + protected final String prefix; - private PrefixListener(String prefix, ProgressListener listener) { + private PrefixListener(String prefix) { this.prefix = prefix; - this.listener = listener; } - @Override - public void notifyProgress(float level, String message) { - listener.notifyProgress(level, prefix + message); - } + public abstract void clearThreads(); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java index 1ca37c31..2c1ab8bb 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteString.java @@ -1,11 +1,29 @@ package org.rdfhdt.hdt.util.string; +/** + * ByteString char sequence, can't be compared with string, faster than string with IO + */ public interface ByteString extends CharSequence, Comparable { - int UTF8_BIG = 2; + /** + * @return empty byte string + */ + static ByteString empty() { + return CompactString.EMPTY; + } + + /** + * convert (if required) to a ByteString, this method might not copy the ByteString + * + * @param sec char sequence + * @return byte string + */ static ByteString of(CharSequence sec) { return ByteStringUtil.asByteString(sec); } + /** + * @return the buffer associated with this byte string, the maximum size should be read with {@link #length()} + */ byte[] getBuffer(); /* (non-Javadoc) @@ -14,18 +32,50 @@ static ByteString of(CharSequence sec) { @Override default int compareTo(ByteString other) { int n = Math.min(length(), other.length()); - byte[] buffer1 = getBuffer(); - byte[] buffer2 = other.getBuffer(); - int k = 0; while (k < n) { - byte c1 = buffer1[k]; - byte c2 = buffer2[k]; + char c1 = charAt(k); + char c2 = other.charAt(k); if (c1 != c2) { - return (c1 & 0xFF) - (c2 & 0xFF); + return c1 - c2; } k++; } - return length() - other.length(); + return length() - other.length(); + } + + @Override + ByteString subSequence(int start, int end); + + /** + * copy this string and append another string + * + * @param other other string + * @return new byte string + */ + default ByteString copyAppend(CharSequence other) { + return copyAppend(ByteString.of(other)); + } + + /** + * copy this string and append another string + * + * @param other other string + * @return new byte string + */ + default ByteString copyAppend(ByteString other) { + byte[] buffer = new byte[length() + other.length()]; + // prefix + System.arraycopy(getBuffer(), 0, buffer, 0, length()); + // text + System.arraycopy(other.getBuffer(), 0, buffer, length(), other.length()); + return new CompactString(buffer); + } + + /** + * @return copy this byte string into another one + */ + default ByteString copy() { + return new CompactString(this); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CompactString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CompactString.java index a0109a53..7ae07c66 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CompactString.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CompactString.java @@ -67,13 +67,8 @@ public 
CompactString(String other) { } public CompactString(CharSequence other) { - if (other instanceof CompactString) { - CompactString str = (CompactString) other; - data = Arrays.copyOf(str.data, str.data.length); - hash = str.hash; - } else if (other instanceof ReplazableString) { - ReplazableString str = (ReplazableString) other; - data = Arrays.copyOf(str.buffer, str.used); + if (other instanceof ByteString) { + data = Arrays.copyOf(((ByteString) other).getBuffer(), other.length()); } else { data = other.toString().getBytes(ByteStringUtil.STRING_ENCODING); } @@ -121,7 +116,7 @@ public int length() { } @Override - public CharSequence subSequence(int start, int end) { + public ByteString subSequence(int start, int end) { if (start < 0 || end > (this.length()) || (end-start)<0) { throw new IllegalArgumentException("Illegal range " + start + "-" + end + " for sequence of length " + length()); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java index ef9be8f4..70290a65 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java @@ -68,7 +68,11 @@ public ReplazableString(byte [] buffer) { public byte [] getBuffer() { return buffer; } - + + public void clear() { + used = 0; + } + private void ensureSize(int size) { if(size>buffer.length) { buffer = Arrays.copyOf(buffer, Math.max(size, buffer.length * 2)); @@ -98,15 +102,15 @@ public void append(CharSequence other) { public void appendNoCompact(CharSequence other) { other = DelayedString.unwrap(other); - if (other instanceof ReplazableString) { - ReplazableString rs = (ReplazableString) other; - this.append(rs.getBuffer(), 0, rs.used); - } else if (other instanceof CompactString) { - this.append(((CompactString) other).getData()); + if (other instanceof ByteString) { + this.appendNoCompact((ByteString) other); } else { this.append(other.toString().getBytes(ByteStringUtil.STRING_ENCODING)); } } + public void appendNoCompact(ByteString other) { + this.append(other.getBuffer(), 0, other.length()); + } public void appendNoCompact(CharSequence other, int offset, int length) { other = DelayedString.unwrap(other); @@ -332,7 +336,7 @@ public boolean equals(Object o) { * @see java.lang.CharSequence#subSequence(int, int) */ @Override - public CharSequence subSequence(int start, int end) { + public ByteString subSequence(int start, int end) { if (start < 0 || end > (this.length()) || (end-start)<0) { throw new IllegalArgumentException("Illegal range " + start + "-" + end + " for sequence of length " + length()); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java index 139c587a..2b921e42 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/utils/DebugOrderNodeIterator.java @@ -1,6 +1,7 @@ package org.rdfhdt.hdt.utils; import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.ReplazableString; @@ -59,7 +60,7 @@ private DebugOrderNodeIterator(Comparator comparator, String name, @Override public void accept(IndexedNode obj) { - CharSequence node = obj.getNode(); + ByteString node = obj.getNode(); if 
(prevBuffer.length() != 0) { int cmp = comparator.compare(prevBuffer, node); if (cmp == 0 && !allowDuplicated) { diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java index 600a9e4b..ec8c3545 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java @@ -36,6 +36,9 @@ import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.io.compress.CompressTest; +import org.rdfhdt.hdt.util.string.ByteString; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.ReplazableString; import java.io.File; import java.io.IOException; @@ -45,11 +48,14 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Random; +import java.util.stream.Stream; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -65,7 +71,7 @@ HDTManagerTest.StaticTest.class }) public class HDTManagerTest { - private static class HDTManagerTestBase extends AbstractMapMemoryTest implements ProgressListener { + public static class HDTManagerTestBase extends AbstractMapMemoryTest implements ProgressListener { protected static String[][] diskDict() { return new String[][]{ // {HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH}, @@ -73,22 +79,38 @@ protected static String[][] diskDict() { }; } + /** + * disable string order consistency test GH#177 + */ + protected static final boolean ALLOW_STRING_CONSISTENCY_TEST = false; protected static final long SIZE_VALUE = 1L << 16; protected static final int SEED = 67; + + private HDTManagerTestBase() { + } + @Rule public TemporaryFolder tempDir = new TemporaryFolder(); protected HDTSpecification spec; + protected Path rootFolder; @Before public void setupManager() throws IOException { spec = new HDTSpecification(); - spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, tempDir.newFolder().getAbsolutePath()); + rootFolder = tempDir.newFolder().toPath(); + spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, rootFolder.toAbsolutePath().toString()); ExceptionThread.startDebug(); } @After - public void closeManager() { + public void closeManager() throws IOException { ExceptionThread.endDebug(); + if (Files.exists(rootFolder)) { + try (Stream s = Files.list(rootFolder)) { + // might be wrong with some OS hidden files? 
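	// [Editor's note, not part of the patch] If OS metadata files (e.g. ".DS_Store")
	// ever trip this emptiness check, a sketch that ignores hidden entries could be
	// used instead; hypothetical, the only extra call is java.nio.file.Files.isHidden:
	//
	//     try (Stream<Path> s = Files.list(rootFolder)) {
	//         assertFalse("root folder not empty", s.anyMatch(p -> {
	//             try {
	//                 return !Files.isHidden(p); // skip OS hidden files
	//             } catch (IOException e) {
	//                 return true; // unreadable entries still count as leftovers
	//             }
	//         }));
	//     }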
+ assertFalse("root folder not empty", s.findAny().isPresent()); + } + } } @Override @@ -96,7 +118,8 @@ public void notifyProgress(float level, String message) { // System.out.println("[" + level + "] " + message); } - protected void assertEqualsHDT(HDT expected, HDT actual) throws NotFoundException { + public static void assertEqualsHDT(HDT expected, HDT actual) throws NotFoundException { + assertEquals("non matching sizes", expected.getTriples().getNumberOfElements(), actual.getTriples().getNumberOfElements()); // test dictionary Dictionary ed = expected.getDictionary(); Dictionary ad = actual.getDictionary(); @@ -115,7 +138,7 @@ protected void assertEqualsHDT(HDT expected, HDT actual) throws NotFoundExceptio assertEqualsHDT(key.toString(), dictE, dictA); }); } else { - assertFalse(ad instanceof MultipleBaseDictionary); + assertFalse("actual dictionary is of type MultipleBaseDictionary, but ed is, actual: " + ad.getClass() + ", excepted: " + ed.getClass(), ad instanceof MultipleBaseDictionary); assertEqualsHDT("Objects", ed.getObjects(), ad.getObjects()); } assertEqualsHDT("Shared", ed.getShared(), ad.getShared()); @@ -149,7 +172,69 @@ protected void assertEqualsHDT(HDT expected, HDT actual) throws NotFoundExceptio } } - protected void assertEqualsHDT(String section, DictionarySection excepted, DictionarySection actual) { + public static void checkHDTConsistency(HDT hdt) { + Dictionary dict = hdt.getDictionary(); + Map map; + map = new HashMap<>(); + if (dict instanceof MultipleBaseDictionary) { + map.putAll(dict.getAllObjects()); + } else { + map.put("Objects", dict.getObjects()); + } + map.put("Subjects", dict.getSubjects()); + map.put("Predicates", dict.getPredicates()); + map.put("Shared", dict.getShared()); + + ReplazableString prev = new ReplazableString(); + Comparator cmp = CharSequenceComparator.getInstance(); + map.forEach((name, section) -> { + prev.clear(); + String prev2 = ""; + Iterator it = section.getSortedEntries(); + if (it.hasNext()) { + prev.replace(it.next()); + } + + while (it.hasNext()) { + CharSequence next = ByteString.of(it.next()); + + int cmpV = cmp.compare(prev, next); + if (cmpV >= 0) { + System.out.print("Prev: "); + printHex(prev); + System.out.print("Next: "); + printHex(next); + System.out.print("Prev: "); + printBin(prev); + System.out.print("Next: "); + printBin(next); + + if (cmpV == 0) { + fail("[" + name + "] (BS) Duplicated elements! " + prev + " = " + next); + } + fail("[" + name + "] (BS) Bad order! " + prev + " > " + next); + } + + if (ALLOW_STRING_CONSISTENCY_TEST) { + String nextStr = next.toString(); + int cmpV2 = cmp.compare(prev2, nextStr); + if (cmpV2 == 0) { + fail("[" + name + "] (Str) Duplicated elements! " + prev2 + " = " + next); + } + if (cmpV2 > 0) { + fail("[" + name + "] (Str) Bad order! 
" + prev2 + " > " + next); + } + + assertEquals("str and byteStr compare aren't returning the same results", Math.signum(cmpV2), Math.signum(cmpV), 0.01); + prev2 = nextStr; + } + prev.replace(next); + } + }); + } + + protected static void assertEqualsHDT(String section, DictionarySection excepted, DictionarySection actual) { + assertEquals("sizes of section " + section + " aren't the same!", excepted.getNumberOfElements(), actual.getNumberOfElements()); Iterator itEx = excepted.getSortedEntries(); Iterator itAc = actual.getSortedEntries(); assertEquals("dictionary section sizes don't match", excepted.getNumberOfElements(), actual.getNumberOfElements()); @@ -162,6 +247,25 @@ protected void assertEqualsHDT(String section, DictionarySection excepted, Dicti } assertFalse("dictionary section " + section + " is bigger than excepted", itAc.hasNext()); } + + protected static void printHex(CharSequence seq) { + ByteString bs = ByteString.of(seq); + byte[] buffer = bs.getBuffer(); + int len = bs.length(); + for (int i = 0; i < len; i++) { + System.out.printf("%2x ", buffer[i] & 0xFF); + } + System.out.println(); + } + protected static void printBin(CharSequence seq) { + ByteString bs = ByteString.of(seq); + byte[] buffer = bs.getBuffer(); + int len = bs.length(); + for (int i = 0; i < len; i++) { + System.out.print(Integer.toBinaryString(buffer[i] & 0xFF) + " "); + } + System.out.println(); + } } @RunWith(Parameterized.class) @@ -251,7 +355,8 @@ private void generateDiskTest() throws IOException, ParserException, NotFoundExc .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) .withMaxLiteralSize(maxLiteralSize) - .withSameTripleString(true); + .withSameTripleString(true) + .withUnicode(true); if (spec.getBoolean("debug.disk.slow.stream")) { supplier.withSlowStream(25); @@ -269,6 +374,7 @@ private void generateDiskTest() throws IOException, ParserException, NotFoundExc spec, quiet ? 
null : this ); + checkHDTConsistency(actual); } finally { if (actual == null) { genActual.getThread().interrupt(); @@ -290,6 +396,7 @@ private void generateDiskTest() throws IOException, ParserException, NotFoundExc spec, null ); + checkHDTConsistency(expected); } finally { if (expected == null) { genExpected.getThread().interrupt(); @@ -316,7 +423,8 @@ public void generateSaveLoadMapTest() throws IOException, ParserException, NotFo LargeFakeDataSetStreamSupplier .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) - .withMaxLiteralSize(maxLiteralSize); + .withMaxLiteralSize(maxLiteralSize) + .withUnicode(true); // create MEMORY HDT @@ -351,10 +459,10 @@ public void generateDiskMemTest() throws IOException, ParserException, NotFoundE public void generateDiskMapTest() throws IOException, ParserException, NotFoundException, InterruptedException { spec.set(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY, size); spec.set("debug.disk.build", true); - File mapHDT = tempDir.newFile("mapHDTTest.hdt"); - spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath()); + Path mapHDT = tempDir.newFile("mapHDTTest.hdt").toPath(); + spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.toAbsolutePath()); generateDiskTest(); - Files.deleteIfExists(mapHDT.toPath()); + Files.deleteIfExists(mapHDT); } @Test @@ -363,7 +471,8 @@ public void catTreeTest() throws IOException, ParserException, NotFoundException LargeFakeDataSetStreamSupplier .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) - .withMaxLiteralSize(maxLiteralSize); + .withMaxLiteralSize(maxLiteralSize) + .withUnicode(true); // create DISK HDT LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.NONE); @@ -412,7 +521,8 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep LargeFakeDataSetStreamSupplier .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) - .withMaxLiteralSize(maxLiteralSize); + .withMaxLiteralSize(maxLiteralSize) + .withUnicode(true); spec.set("debug.disk.build", true); @@ -502,7 +612,8 @@ public void catTreeTest() throws IOException, ParserException, NotFoundException LargeFakeDataSetStreamSupplier .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) - .withMaxLiteralSize(maxLiteralSize); + .withMaxLiteralSize(maxLiteralSize) + .withUnicode(true); // create DISK HDT LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.NONE); @@ -552,7 +663,8 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep LargeFakeDataSetStreamSupplier .createSupplierWithMaxSize(maxSize, SEED) .withMaxElementSplit(maxElementSplit) - .withMaxLiteralSize(maxLiteralSize); + .withMaxLiteralSize(maxLiteralSize) + .withUnicode(true); // create DISK HDT LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.NONE); @@ -589,6 +701,8 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep assertNotNull(expected); assertNotNull(actual); try { + checkHDTConsistency(expected); + checkHDTConsistency(actual); assertEqualsHDT(expected, actual); // -1 for the original size ignored by hdtcat } finally { IOUtil.closeAll(expected, actual); @@ -600,8 +714,9 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep public static class FileDynamicTest extends HDTManagerTestBase { 
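	/*
	 * [Editor's illustration, not part of the patch] The catTree tests above drive
	 * the new HDTManager.catTree entry point. A minimal usage sketch, assuming the
	 * RDFFluxStop.sizeLimit(long) and HDTSupplier.memory() factories introduced by
	 * this patch series:
	 *
	 *     try (HDT hdt = HDTManager.catTree(
	 *             RDFFluxStop.sizeLimit(100_000_000), // start a new chunk every ~100 MB of triples
	 *             HDTSupplier.memory(),               // build each chunk as an in-memory HDT
	 *             "dataset.nt", "http://example.org/#", RDFNotation.NTRIPLES,
	 *             new HDTSpecification(), null)) {
	 *         hdt.saveToHDT("dataset.hdt", null);
	 *     }
	 *
	 * Chunks are generated until the flux stop triggers and are then merged pairwise
	 * with hdtCat, so peak memory follows the chunk size rather than the dataset size.
	 */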
@Parameterized.Parameters(name = "{0}") public static Collection params() { - return List.of( - new Object[]{"hdtGenDisk/unicode_disk_encode.nt", true, SIZE_VALUE} + return List.of( + new Object[]{"hdtGenDisk/unicode_disk_encode.nt", true, SIZE_VALUE}, + new Object[]{"unicodeTest.nt", true, SIZE_VALUE} ); } @@ -616,27 +731,25 @@ public static Collection params() { private void generateDiskTest() throws IOException, ParserException, NotFoundException { String ntFile = Objects.requireNonNull(getClass().getClassLoader().getResource(file), "Can't find " + file).getFile(); // create DISK HDT - HDT actual = HDTManager.generateHDTDisk( + try (HDT actual = HDTManager.generateHDTDisk( ntFile, HDTTestUtils.BASE_URI, RDFNotation.NTRIPLES, spec, quiet ? null : this - ); - - // create MEMORY HDT - HDT expected = HDTManager.generateHDT( - ntFile, - HDTTestUtils.BASE_URI, - RDFNotation.NTRIPLES, - spec, - null - ); - - try { - assertEqualsHDT(expected, actual); - } finally { - IOUtil.closeAll(expected, actual); + )) { + // create MEMORY HDT + try (HDT expected = HDTManager.generateHDT( + ntFile, + HDTTestUtils.BASE_URI, + RDFNotation.NTRIPLES, + spec, + null + )) { + checkHDTConsistency(actual); + checkHDTConsistency(expected); + assertEqualsHDT(expected, actual); + } } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatRandomTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatRandomTest.java index 6746f731..aabd3a79 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatRandomTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtCat/HdtCatRandomTest.java @@ -1,15 +1,17 @@ package org.rdfhdt.hdt.hdtCat; -import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.hdt.HDTManagerTest; import org.rdfhdt.hdt.hdtDiff.HdtDiffTest; +import org.rdfhdt.hdt.iterator.utils.CombinedIterator; import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier; @@ -23,11 +25,15 @@ @RunWith(Parameterized.class) public class HdtCatRandomTest extends AbstractMapMemoryTest { - @Parameterized.Parameters(name = "{0}") + @Parameterized.Parameters(name = "{0} unicode:{2}") public static Collection genParam() { List list = new ArrayList<>(); for (HdtDiffTest.DictionaryTestData data : HdtDiffTest.DICTIONARY_TEST_DATA) { - list.add(new Object[]{data.dictionaryType, data.dictionaryTempType}); + if (data.dictionaryType.equals(HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_PSFC_SECTION)) { + continue; // TODO: not handled? 
+ } + list.add(new Object[]{data.dictionaryType, data.dictionaryTempType, true}); + list.add(new Object[]{data.dictionaryType, data.dictionaryTempType, false}); } return list; } @@ -36,37 +42,59 @@ public static Collection genParam() { @Rule public TemporaryFolder tempDir = new TemporaryFolder(); private final HDTSpecification spec; + private final boolean unicode; - public HdtCatRandomTest(String dictionaryType, String tempDictionaryImpl) { + public HdtCatRandomTest(String dictionaryType, String tempDictionaryImpl, boolean unicode) { spec = new HDTSpecification(); spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, dictionaryType); spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, tempDictionaryImpl); + this.unicode = unicode; } @Test - @Ignore("large") - public void largeFakeTest() throws ParserException, IOException { + public void fakeTest() throws ParserException, IOException, NotFoundException { File root = tempDir.newFolder(); String location = new File(root, "catHdt").getAbsolutePath(); String hdt1F = new File(root, "hdt1").getAbsolutePath(); String hdt2F = new File(root, "hdt2").getAbsolutePath(); + String hdtCatExcepted = new File(root, "hdtCatExcepted").getAbsolutePath(); String catOutput = new File(root, "catResult").getAbsolutePath(); - LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(1_000_000, 484); - supplier.maxFakeType = 4; - supplier.maxElementSplit = 1000; + long size = 10_000; + long seed = 482; + + LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier + .createSupplierWithMaxTriples(size, seed) + .withMaxFakeType(4) + .withMaxElementSplit(1000) + .withUnicode(unicode); supplier.createAndSaveFakeHDT(spec, hdt1F); supplier.createAndSaveFakeHDT(spec, hdt2F); + supplier.reset(); + + try (HDT hdtMerge = HDTManager.generateHDT(CombinedIterator.combine( + List.of(supplier.createTripleStringStream(), supplier.createTripleStringStream()) + ), "http://w", spec, null)) { + hdtMerge.saveToHDT(hdtCatExcepted, null); + } + try (HDT cat = HDTManager.catHDT(location, hdt1F, hdt2F, spec, null)) { cat.saveToHDT(catOutput, null); } - HDT loadedHDT = HDTManager.loadIndexedHDT(catOutput, null, spec); - loadedHDT.close(); + try (HDT excepted = HDTManager.loadHDT(hdtCatExcepted, null, spec); + HDT actual = HDTManager.loadHDT(catOutput, null, spec)) { + HDTManagerTest.HDTManagerTestBase.checkHDTConsistency(excepted); + HDTManagerTest.HDTManagerTestBase.checkHDTConsistency(actual); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT(excepted, actual); + } - HDT mappedHDT = HDTManager.mapIndexedHDT(catOutput, spec, null); - mappedHDT.close(); + try (HDT excepted = HDTManager.mapHDT(hdtCatExcepted, null, spec)) { + try (HDT actual = HDTManager.mapHDT(catOutput, null, spec)) { + HDTManagerTest.HDTManagerTestBase.checkHDTConsistency(actual); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT(excepted, actual); + } + } } - } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/CombinedIteratorTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/CombinedIteratorTest.java new file mode 100644 index 00000000..a39c078e --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/CombinedIteratorTest.java @@ -0,0 +1,26 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.junit.Test; + +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.junit.Assert.*; + +public class CombinedIteratorTest { 
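	// [Editor's note, not part of the patch] CombinedIterator.combine concatenates
	// its sources in list order but wires them as a balanced binary tree ("use a
	// tree to reduce the effect of big combine"), so each hasNext() crosses
	// O(log n) wrappers instead of O(n) for n source iterators. Usage sketch
	// (iterator names hypothetical):
	//
	//     Iterator<TripleString> all = CombinedIterator.combine(List.of(a, b, c));
	//     // yields every element of a, then every element of b, then every element of c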
+ @Test + public void combineTest() { + List> its = IntStream.range(0, 10).mapToObj( + l -> IntStream.range(l * 100, (l + 1) * 100).boxed().collect(Collectors.toList()).iterator() + ).collect(Collectors.toList()); + + Iterator it = CombinedIterator.combine(its); + IntStream.range(0, 100 * 10).forEach(i -> { + assertTrue(it.hasNext()); + assertEquals(i, (int) it.next()); + }); + } + +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java index e96fdcda..b0d7e2b2 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/IndexNodeDeltaMergeExceptionIteratorTest.java @@ -3,6 +3,8 @@ import org.junit.Test; import org.rdfhdt.hdt.triples.IndexedNode; import org.rdfhdt.hdt.util.string.AssertionCharSequence; +import org.rdfhdt.hdt.util.string.ByteString; +import org.rdfhdt.hdt.util.string.CompactString; import java.util.Iterator; import java.util.List; @@ -126,28 +128,28 @@ public void deltaComputeTest() { new AssertionCharSequence("bbcd", 0) ); - assertEquals("", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(CompactString.EMPTY, ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); assertEquals(0, it.lastDelta()); - assertEquals("", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(CompactString.EMPTY, ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); assertEquals(0, it.lastDelta()); - assertEquals("aaa", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(new CompactString("aaa"), ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); assertEquals(0, it.lastDelta()); - assertEquals("aab", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(new CompactString("aab"), ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); assertEquals(2, it.lastDelta()); - assertEquals("aac", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(new CompactString("aac"), ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); assertEquals(2, it.lastDelta()); - assertEquals("aacd", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(new CompactString("aacd"), ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); assertEquals(3, it.lastDelta()); - assertEquals("abcd", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(new CompactString("abcd"), ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); assertEquals(1, it.lastDelta()); - assertEquals("bbcd", ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); + assertEquals(new CompactString("bbcd"), ((AssertionCharSequence) it.fetchNode().getNode()).getSequence()); assertEquals(0, it.lastDelta()); assertNull(it.fetchNode()); @@ -155,11 +157,11 @@ public void deltaComputeTest() { @Test public void mergeComputeTest() { - List output = List.of( + List output = Stream.of( "", "a", "b" - ); + ).map(ByteString::of).collect(Collectors.toList()); IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it1 = createFromSortedArray( new AssertionCharSequence("a", 0) @@ -176,7 +178,7 @@ public void mergeComputeTest() { 2 ); - Iterator itE = output.iterator(); + Iterator itE = 
output.iterator(); while (it.hasNext()) { CharSequence sequence = ((AssertionCharSequence) it.next().getNode()).getSequence(); assertTrue("missing: " + sequence, itE.hasNext()); @@ -188,7 +190,7 @@ public void mergeComputeTest() { @Test public void mergeCountComputeTest() { - List output = List.of( + List output = Stream.of( "", "aaa", "aab", @@ -199,7 +201,7 @@ public void mergeCountComputeTest() { "bacdd", "bacde", "bacdz" - ); + ).map(ByteString::of).collect(Collectors.toList()); IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it1 = createFromSortedArray( new AssertionCharSequence("aaa", 0), @@ -223,7 +225,7 @@ public void mergeCountComputeTest() { 2 ); - Iterator itE = output.iterator(); + Iterator itE = output.iterator(); while (it.hasNext()) { CharSequence sequence = ((AssertionCharSequence) it.next().getNode()).getSequence(); assertTrue("missing: " + sequence, itE.hasNext()); @@ -235,7 +237,7 @@ public void mergeCountComputeTest() { @Test public void deepMergeComputeTest() { - List output = List.of( + List output = Stream.of( "", "aa", "aaa", @@ -254,7 +256,7 @@ public void deepMergeComputeTest() { "bze", "cd", "ce" - ); + ).map(ByteString::of).collect(Collectors.toList()); IndexNodeDeltaMergeExceptionIterator.IndexNodeDeltaFetcher it1 = createFromSortedArray( new AssertionCharSequence("aa", 0), @@ -297,10 +299,10 @@ public void deepMergeComputeTest() { ((IndexNodeDeltaMergeExceptionIterator) it).printMergeTree(); - Iterator itE = output.iterator(); + Iterator itE = output.iterator(); while (it.hasNext()) { assertTrue(itE.hasNext()); - CharSequence seq = ((AssertionCharSequence) it.next().getNode()).getSequence(); + ByteString seq = ((AssertionCharSequence) it.next().getNode()).getSequence(); assertEquals(itE.next(), seq); } assertFalse(itE.hasNext()); @@ -311,7 +313,7 @@ public void largeTest() { // (tried with 200_000) final long size = 2_000; Random random = new Random(35); - List randy = Stream.generate(() -> { + List randy = Stream.generate(() -> { String table = "abcd"; StringBuilder bld = new StringBuilder(); // +1 because we don't have empty strings during this step @@ -320,11 +322,11 @@ public void largeTest() { bld.append(table.charAt(bn % table.length())); bn /= table.length(); } - return bld.toString(); + return new CompactString(bld); }) .limit(size) .collect(Collectors.toList()); - List sortedRandy = randy.stream().sorted().collect(Collectors.toList()); + List sortedRandy = randy.stream().sorted().collect(Collectors.toList()); assertEquals(size, sortedRandy.size()); @@ -335,7 +337,7 @@ public void largeTest() { ); int index = 0; - Iterator itE = sortedRandy.iterator(); + Iterator itE = sortedRandy.iterator(); while (it.hasNext()) { assertTrue(itE.hasNext()); CharSequence actual = ((AssertionCharSequence) it.next().getNode()).getSequence(); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java index f4d981db..a6729856 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java @@ -14,6 +14,7 @@ import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.string.ByteStringUtil; +import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; @@ -21,6 +22,7 @@ import java.io.PipedInputStream; import java.io.PipedOutputStream; import 
java.io.PrintStream; +import java.io.Writer; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; @@ -46,7 +48,7 @@ public class LargeFakeDataSetStreamSupplier { public static String stringNameOfInt(int i, boolean unicode) { StringBuilder out = new StringBuilder(); if (unicode) { - return "" + (char) (30 + Math.min(i, Character.MAX_VALUE - 30)); + return new String(Character.toChars(Math.min(i, Character.MAX_CODE_POINT))); } else { String table = "abcdefghijklmnopqrstuvwxyz"; int c = i; @@ -155,10 +157,21 @@ public void createNTFile(String file) throws IOException { * @see #createNTFile(java.lang.String) */ public void createNTFile(Path file) throws IOException { - try (FileWriter writer = new FileWriter(file.toFile())) { - for (Iterator it = createTripleStringStream(); it.hasNext(); ) { - it.next().dumpNtriple(writer); - } + try (BufferedWriter writer = Files.newBufferedWriter(file)) { + createNTFile(writer); + } + } + + /** + * create a nt file from the stream + * + * @param writer the writer to write + * @throws IOException io exception + * @see #createNTFile(java.lang.String) + */ + public void createNTFile(Writer writer) throws IOException { + for (Iterator it = createTripleStringStream(); it.hasNext(); ) { + it.next().dumpNtriple(writer); } } @@ -290,7 +303,7 @@ private CharSequence createValue() { int size = random.nextInt(maxLiteralSize); StringBuilder litText = new StringBuilder(); for (int i = 0; i < size; i++) { - litText.append(stringNameOfInt(unicode ? random.nextInt(Character.MAX_VALUE - 30) : random.nextInt(maxElementSplit), unicode)); + litText.append(stringNameOfInt(unicode ? random.nextInt(Character.MAX_CODE_POINT - 30) + 30 : random.nextInt(maxElementSplit), unicode)); } String text = "\"" + litText + "\""; int litType = random.nextInt(3); @@ -308,7 +321,7 @@ private CharSequence createValue() { private class FakeStatementIterator implements Iterator { private long size; - private long count; + private long count = 0; private TripleString buffer; private TripleString next; @@ -320,7 +333,7 @@ private class FakeStatementIterator implements Iterator { @Override public boolean hasNext() { - if (size >= maxSize || count >= maxTriples) { + if (size >= maxSize || count > maxTriples) { return false; } if (next != null) { @@ -358,7 +371,7 @@ public boolean hasNext() { size += estimation; count++; - return size < maxSize && count < maxTriples; + return size < maxSize && count <= maxTriples; } @Override diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java index 37fc5565..7df76445 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java @@ -1,18 +1,25 @@ package org.rdfhdt.hdt.util; -import org.junit.Assert; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.exceptions.NotFoundException; +import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.hdt.HDT; +import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.hdt.HDTManagerTest; import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; +import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.rdf.RDFParserCallback; import org.rdfhdt.hdt.rdf.RDFParserFactory; import 
org.rdfhdt.hdt.triples.TripleString; import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils; +import java.io.BufferedInputStream; +import java.io.BufferedWriter; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.Iterator; @@ -22,31 +29,114 @@ import static org.junit.Assert.assertTrue; public class LargeFakeDataSetStreamSupplierTest { - @Rule - public TemporaryFolder tempDir = new TemporaryFolder(); - @Test - public void streamTest() throws IOException { - LargeFakeDataSetStreamSupplier triples = LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(10, 10); - Path f = tempDir.newFolder().toPath(); - Path testNt = f.resolve("test.nt"); - triples.createNTFile(testNt.toAbsolutePath().toString()); - triples.reset(); - - Iterator it2 = triples.createTripleStringStream(); - try (InputStream is = Files.newInputStream(testNt)) { - try (PipedCopyIterator it = RDFParserFactory.readAsIterator( - RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES), - is, - HDTTestUtils.BASE_URI, - true, - RDFNotation.NTRIPLES - )) { - it.forEachRemaining(s -> { - assertTrue(it2.hasNext()); - assertEquals(it2.next(), s); - }); - assertFalse(it.hasNext()); - } - } - } + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + + @Test + public void streamTest() throws IOException { + LargeFakeDataSetStreamSupplier triples = LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(10, 10); + Path f = tempDir.newFolder().toPath(); + Path testNt = f.resolve("test.nt"); + triples.createNTFile(testNt.toAbsolutePath().toString()); + triples.reset(); + + Iterator it2 = triples.createTripleStringStream(); + try (InputStream is = Files.newInputStream(testNt)) { + try (PipedCopyIterator it = RDFParserFactory.readAsIterator( + RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES), + is, + HDTTestUtils.BASE_URI, + true, + RDFNotation.NTRIPLES + )) { + it.forEachRemaining(s -> { + assertTrue(it2.hasNext()); + assertEquals(it2.next(), s); + }); + assertFalse(it.hasNext()); + } + } + } + + @Test + public void countTest() { + long size = 42; + Iterator it = LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(size, 34) + .createTripleStringStream(); + int count = 0; + while (it.hasNext()) { + it.next(); + count++; + } + assertEquals(size, count); + } + + @Test + public void mergeTest() throws IOException, ParserException, NotFoundException { + Path root = tempDir.getRoot().toPath(); + long size = 42; + long seed = 54; + LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier.createSupplierWithMaxSize(size, seed); + + Path p12 = root.resolve("p12.nt"); + Path p12HDT = root.resolve("p12.hdt"); + Path p3 = root.resolve("p3.nt"); + Path p3HDT = root.resolve("p3.hdt"); + + try (BufferedWriter w = Files.newBufferedWriter(p12)) { + supplier.createNTFile(w); + supplier.createNTFile(w); + } + + LargeFakeDataSetStreamSupplier supplier2 = LargeFakeDataSetStreamSupplier.createSupplierWithMaxSize(size * 2, seed); + + supplier2.createNTFile(p3); + + RDFParserCallback parser = RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES, true); + try { + try ( + InputStream stream = new BufferedInputStream(Files.newInputStream(p12)); + InputStream stream2 = new BufferedInputStream(Files.newInputStream(p3)); + PipedCopyIterator it1 = RDFParserFactory.readAsIterator(parser, stream, "http://w", true, RDFNotation.NTRIPLES); + PipedCopyIterator it2 = RDFParserFactory.readAsIterator(parser, stream2, 
"http://w", true, RDFNotation.NTRIPLES) + ) { + while (it1.hasNext()) { + assertTrue(it2.hasNext()); + assertEquals(it1.next(), it2.next()); + } + assertFalse(it2.hasNext()); + } + try ( + HDT exceptedHdt = HDTManager.generateHDT( + p12.toAbsolutePath().toString(), "http://w", + RDFNotation.NTRIPLES, new HDTSpecification(), null + ); + HDT actualHDT = HDTManager.generateHDT( + p3.toAbsolutePath().toString(), "http://w", + RDFNotation.NTRIPLES, new HDTSpecification(), null + ) + ) { + exceptedHdt.saveToHDT(p12HDT.toAbsolutePath().toString(), null); + actualHDT.saveToHDT(p3HDT.toAbsolutePath().toString(), null); + } + + try ( + HDT p1HDTLoad = HDTManager.mapHDT(p12HDT.toAbsolutePath().toString()); + HDT p3HDTLoad = HDTManager.mapHDT(p3HDT.toAbsolutePath().toString()) + ) { + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT(p1HDTLoad, p3HDTLoad); + supplier2.reset(); + try (HDT actual = supplier2.createFakeHDT(new HDTSpecification())) { + HDTManagerTest.HDTManagerTestBase.checkHDTConsistency(actual); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT(p3HDTLoad, actual); + } + } + } finally { + try { + Files.deleteIfExists(p12); + } finally { + Files.deleteIfExists(p3); + } + } + } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/UnicodeEscapeTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/UnicodeEscapeTest.java index 927e84a5..bbcf1021 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/UnicodeEscapeTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/UnicodeEscapeTest.java @@ -57,7 +57,7 @@ public void encodeTest() throws ParserException { @Test public void decodeTest() { assertEquals( - "\uD877\uDD76", + new String(Character.toChars(0x0002dd76)), UnicodeEscape.unescapeString("\\U0002dd76") ); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java index 6354f4dd..166fe11a 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java @@ -6,6 +6,7 @@ import org.rdfhdt.hdt.triples.IndexedNode; import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.DelayedString; import java.util.Arrays; import java.util.HashSet; diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequence.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequence.java index ee024ab5..51d3e446 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequence.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/AssertionCharSequence.java @@ -5,8 +5,8 @@ /** * CharSequence wrapper throwing an {@link java.lang.AssertionError} if we try to read before the minimum index */ -public class AssertionCharSequence implements CharSequence { - private final CharSequence sequence; +public class AssertionCharSequence implements ByteString { + private final ByteString sequence; private final int minimumRead; /** @@ -15,10 +15,19 @@ public class AssertionCharSequence implements CharSequence { * @param sequence wrapped sequence * @param minimumRead minimum index to read (inclusive) */ - public AssertionCharSequence(CharSequence sequence, int minimumRead) { + public AssertionCharSequence(ByteString sequence, int minimumRead) { this.sequence = sequence; this.minimumRead = minimumRead; } + /** + * create 
an assertion cs + * + * @param sequence wrapped sequence + * @param minimumRead minimum index to read (inclusive) + */ + public AssertionCharSequence(CharSequence sequence, int minimumRead) { + this(ByteString.of(sequence), minimumRead); + } @Override public int length() { @@ -34,7 +43,7 @@ public char charAt(int index) { } @Override - public CharSequence subSequence(int start, int end) { + public ByteString subSequence(int start, int end) { if (start < minimumRead) { throw new AssertionError("Tried to create subSequence before minimum index! " + start + " / " + minimumRead); } @@ -56,7 +65,12 @@ public IntStream codePoints() { return sequence.codePoints(); } - public CharSequence getSequence() { + public ByteString getSequence() { return sequence; } + + @Override + public byte[] getBuffer() { + return sequence.getBuffer(); + } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/ByteStringTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/ByteStringTest.java index 9001c048..e884dd21 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/ByteStringTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/string/ByteStringTest.java @@ -1,48 +1,33 @@ package org.rdfhdt.hdt.util.string; +import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import java.text.Collator; +import static org.junit.Assert.assertEquals; + public class ByteStringTest { - private static void printHex(byte[] b) { - for (byte bb : b) { - System.out.printf("%2x ", bb); - } - System.out.println(); - } - private static void printBin(byte[] b) { - for (byte bb : b) { - String s = Integer.toBinaryString(bb & 0xFF); - System.out.print("0".repeat(8 - s.length()) + s + " "); - } - System.out.println(); - } @Test + @Ignore("failing https://github.com/rdfhdt/hdt-java/issues/177") public void utf32Test() { - String ss1 = "\uD85B\uDCE3"; - String ss2 = "\uF4D1"; - - ByteString b1 = ByteString.of(ss1); - ByteString b2 = ByteString.of(ss2); + String ss1 = new String(Character.toChars(0x26ce3)); // 𦳣 + String ss2 = new String(Character.toChars(0xf4d1)); //  - assert ss1.equals(b1.toString()); - assert ss2.equals(b2.toString()); + System.out.println(ss1.compareTo(ss2)); + System.out.println(Integer.compare(0x26ce3, 0xf4d1)); - Collator coll = Collator.getInstance(); - System.out.println("BYTESTRING: " + b1 + (b1.compareTo(b2) < 0 ? " < " : " > ") + b2); - System.out.println("STRING : " + b1 + (b1.toString().compareTo(b2.toString()) < 0 ? " < " : " > ") + b2); - System.out.println("COLLATOR : " + b1 + (coll.compare(b1.toString(), b2.toString()) < 0 ? 
" < " : " > ") + b2); + CompactString b1 = new CompactString(ss1); + CompactString b2 = new CompactString(ss2); - printHex(b1.getBuffer()); - printHex(b2.getBuffer()); + assertEquals(ss1, b1.toString()); + assertEquals(ss2, b2.toString()); - printBin(b1.getBuffer()); - printBin(b2.getBuffer()); + int cmpByte = Math.max(-1, Math.min(1, b1.compareTo(b2))); + int cmpStr = Math.max(-1, Math.min(1, b1.toString().compareTo(b2.toString()))); - System.out.println(Character.isHighSurrogate(ss1.charAt(0)) + ", " + Character.isLowSurrogate(ss1.charAt(1))); - System.out.println(Character.toCodePoint(ss1.charAt(0), ss1.charAt(1))); - System.out.println((int) ss2.charAt(0)); + assertEquals(cmpStr, cmpByte); } } diff --git a/hdt-java-package/bin/hdtVerify.bat b/hdt-java-package/bin/hdtVerify.bat new file mode 100644 index 00000000..2c47d8c8 --- /dev/null +++ b/hdt-java-package/bin/hdtVerify.bat @@ -0,0 +1,5 @@ +@echo off + +call "%~dp0\javaenv.bat" + +"%JAVACMD%" %JAVAOPTIONS% -classpath %~dp0\..\lib\* org.rdfhdt.hdt.tools.HDTVerify %* From e7b2cdd935f1b29aaff819e1b3d05ae7363f155f Mon Sep 17 00:00:00 2001 From: qaate47 Date: Thu, 10 Nov 2022 14:59:31 +0100 Subject: [PATCH 8/9] Add genDisk with Multi Section Dictionaries (MSC), progress bar for the rdf2hdt logs and check MSC with hdtVerify --- .../org/rdfhdt/hdt/options/HDTOptions.java | 6 +- .../java/org/rdfhdt/hdt/tools/HDTVerify.java | 156 ++++++++++++++---- .../java/org/rdfhdt/hdt/tools/RDF2HDT.java | 102 ++++-------- .../rdfhdt/hdt/util/listener/ColorTool.java | 83 ++++++++++ .../listener/MultiThreadListenerConsole.java | 34 +++- .../hdt/dictionary/DictionaryFactory.java | 42 ++++- .../impl/MultipleBaseDictionary.java | 5 - .../impl/MultipleSectionDictionary.java | 65 +++++++- .../impl/MultipleSectionDictionaryBig.java | 31 ++++ .../impl/WriteMultipleSectionDictionary.java | 110 +++++++----- .../section/OneReadDictionarySection.java | 2 - .../impl/section/WriteDictionarySection.java | 15 +- .../rdfhdt/hdt/hdt/impl/HDTDiskImporter.java | 4 +- .../impl/diskimport/CompressionResult.java | 1 - .../MultiSectionSectionCompressor.java | 23 +++ .../impl/diskimport/SectionCompressor.java | 3 +- .../hdt/iterator/utils/PeekIterator.java | 64 +++++++ .../hdt/iterator/utils/StopIterator.java | 40 +++++ .../org/rdfhdt/hdt/util/LiteralsUtils.java | 28 ++-- .../java/org/rdfhdt/hdt/util/io/IOUtil.java | 5 + .../CompressFourSectionDictionaryTest.java | 1 - .../section/OneReadDictionarySectionTest.java | 6 +- .../org/rdfhdt/hdt/hdt/HDTManagerTest.java | 16 +- .../hdt/triples/impl/utils/HDTTestUtils.java | 7 +- .../rdfhdt/hdt/util/LiteralsUtilsTest.java | 16 +- 25 files changed, 658 insertions(+), 207 deletions(-) create mode 100644 hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/ColorTool.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/MultiSectionSectionCompressor.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PeekIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/StopIterator.java diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java index 183f79d7..e122d173 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java @@ -170,11 +170,11 @@ default RDFFluxStop getFluxStop(String key, RDFFluxStop defaultValue) { * @return long or defaultValue if the value isn't defined */ default long 
getInt(String key, LongSupplier defaultValue) { - long l = getInt(key); - if (l == 0) { + String l = get(key); + if (l == null) { return defaultValue.getAsLong(); } - return l; + return Long.parseLong(l); } /** diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java index a7cbb17c..dc16d93d 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java @@ -1,28 +1,59 @@ package org.rdfhdt.hdt.tools; +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import com.beust.jcommander.internal.Lists; +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.dictionary.impl.MultipleBaseDictionary; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.util.listener.ColorTool; import org.rdfhdt.hdt.util.string.ByteString; -import org.rdfhdt.hdt.util.string.CharSequenceComparator; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.ReplazableString; -import java.util.Comparator; +import java.io.IOException; import java.util.Iterator; +import java.util.List; +import java.util.Map; public class HDTVerify { private HDTVerify() { } - private static void print(byte[] arr) { + @Parameter(description = "") + public List parameters = Lists.newArrayList(); + + @Parameter(names = "-unicode", description = "Ignore UNICODE order") + public boolean unicode; + + @Parameter(names = "-color", description = "Print using color (if available)") + public boolean color; + + @Parameter(names = "-binary", description = "Print binaries of the string in case of signum error") + public boolean binary; + + @Parameter(names = "-quiet", description = "Do not show progress of the conversion") + public boolean quiet; + + @Parameter(names = "-load", description = "Load the HDT in memory for faster results (might be impossible for large a HDT)") + public boolean load; + + public ColorTool colorTool; + + private HDT loadOrMap(String file) throws IOException { + return load ? 
HDTManager.loadHDT(file) : HDTManager.mapHDT(file); + } + + private void print(byte[] arr) { for (byte b : arr) { System.out.printf("%02X ", b); } System.out.println(); } - private static void print(CharSequence seq) { + private void print(CharSequence seq) { if (seq instanceof CompactString) { CompactString cs1 = (CompactString) seq; print(cs1.getData()); @@ -34,9 +65,10 @@ private static void print(CharSequence seq) { } } - public static void checkDictionarySectionOrder(Iterator it) { + public boolean checkDictionarySectionOrder(Iterator it) { ReplazableString prev = new ReplazableString(); String lastStr = ""; + boolean error = false; while (it.hasNext()) { ByteString charSeq = ByteString.of(it.next()); String str = charSeq.toString(); @@ -44,42 +76,108 @@ public static void checkDictionarySectionOrder(Iterator int cmp = prev.compareTo(charSeq); if (cmp >= 0) { - System.out.println("ERRA: " + prev + " / " + charSeq); + error = true; + if (cmp == 0) { + colorTool.error("Duplicated(bs)", prev + " == " + charSeq); + } else { + colorTool.error("Bad order(bs)", prev + " > " + charSeq); + } } - int cmp2 = lastStr.compareTo(str); + if (!unicode) { + int cmp2 = lastStr.compareTo(str); - if (cmp2 >= 0) { - System.out.println("ERRB: " + lastStr + " / " + str); - } + if (cmp2 >= 0) { + error = true; + if (cmp == 0) { + colorTool.error("Duplicated(str)", lastStr + " == " + str); + } else { + colorTool.error("Bad order(str)", lastStr + " > " + str); + } + } - if (Math.signum(cmp) != Math.signum(cmp2)) { - System.out.println("Not equal: " + cmp + " / " + cmp2); - print(prev); - print(charSeq); - print(lastStr); - print(str); + if (Math.signum(cmp) != Math.signum(cmp2)) { + error = true; + colorTool.error("Not equal", cmp + " != " + cmp2 + " for " + lastStr + " / " + str); + if (binary) { + print(prev); + print(charSeq); + print(lastStr); + print(str); + } + } + + lastStr = str; } prev.replace(charSeq); - lastStr = str; + } + if (error) { + colorTool.warn("Not valid section"); + } else { + colorTool.log("valid section"); + } + return error; + } + + public void exec() throws Throwable { + try (HDT hdt = loadOrMap(parameters.get(0))) { + boolean error; + long count = 0; + if (hdt.getDictionary() instanceof MultipleBaseDictionary) { + colorTool.log("Checking subject entries"); + error = checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries()); + count += hdt.getDictionary().getSubjects().getNumberOfElements(); + colorTool.log("Checking predicate entries"); + error |= checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries()); + count += hdt.getDictionary().getPredicates().getNumberOfElements(); + colorTool.log("Checking object entries"); + Map allObjects = hdt.getDictionary().getAllObjects(); + for (Map.Entry entry : allObjects.entrySet()) { + CharSequence sectionName = entry.getKey(); + DictionarySection section = entry.getValue(); + colorTool.log("Checking object section " + sectionName); + error |= checkDictionarySectionOrder(section.getSortedEntries()); + count += section.getNumberOfElements(); + } + colorTool.log("Checking shared entries"); + error |= checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries()); + count += hdt.getDictionary().getShared().getNumberOfElements(); + } else { + colorTool.log("Checking subject entries"); + error = checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries()); + count += hdt.getDictionary().getSubjects().getNumberOfElements(); + colorTool.log("Checking predicate 
entries"); + error |= checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries()); + count += hdt.getDictionary().getPredicates().getNumberOfElements(); + colorTool.log("Checking object entries"); + error |= checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries()); + count += hdt.getDictionary().getObjects().getNumberOfElements(); + colorTool.log("Checking shared entries"); + error |= checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries()); + count += hdt.getDictionary().getShared().getNumberOfElements(); + } + + if (error) { + colorTool.error("This HDT isn't valid", true); + System.exit(-1); + } else { + colorTool.log(count + " element(s) parsed"); + colorTool.log(colorTool.color(0, 5, 0) + "This HDT is valid", true); + } } } public static void main(String[] args) throws Throwable { - if (args.length < 1) { - System.out.println("hdtVerify "); + HDTVerify verify = new HDTVerify(); + JCommander com = new JCommander(verify); + com.parse(args); + verify.colorTool = new ColorTool(verify.color, verify.quiet); + com.setProgramName("hdtVerify"); + if (verify.parameters.size() < 1) { + com.usage(); System.exit(-1); } - try (HDT hdt = HDTManager.mapHDT(args[0], null)) { - System.out.println("Checking subject entries"); - checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries()); - System.out.println("Checking predicate entries"); - checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries()); - System.out.println("Checking object entries"); - checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries()); - System.out.println("Checking shared entries"); - checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries()); - } + verify.exec(); } } diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java index 33f4e3e8..0fb09a54 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java @@ -50,6 +50,7 @@ import com.beust.jcommander.Parameter; import com.beust.jcommander.internal.Lists; import org.rdfhdt.hdt.util.StringUtil; +import org.rdfhdt.hdt.util.listener.ColorTool; import org.rdfhdt.hdt.util.listener.MultiThreadListenerConsole; /** @@ -67,7 +68,9 @@ private static long getMaxTreeCatChunkSize() { public String rdfInput; public String hdtOutput; - + + private ColorTool colorTool; + @Parameter(description = " ") public List parameters = Lists.newArrayList(); @@ -81,7 +84,7 @@ private static long getMaxTreeCatChunkSize() { public String rdfType; @Parameter(names = "-version", description = "Prints the HDT version number") - public static boolean showVersion; + public boolean showVersion; @Parameter(names = "-base", description = "Base URI for the dataset") public String baseURI; @@ -112,6 +115,7 @@ private static long getMaxTreeCatChunkSize() { @Parameter(names = "-printoptions", description = "Print options") public boolean printoptions; + @Parameter(names = "-color", description = "Print using color (if available)") public boolean color; @@ -147,7 +151,7 @@ public void execute() throws ParserException, IOException { } else { baseURI = Path.of(rdfInput).toUri().toString(); } - warn("base uri not specified, using '" + baseURI + "'"); + colorTool.warn("base uri not specified, using '" + baseURI + "'"); } RDFNotation notation = null; @@ -155,7 +159,7 @@ public void execute() throws 
ParserException, IOException { try { notation = RDFNotation.parse(rdfType); } catch (IllegalArgumentException e) { - warn("Notation " + rdfType + " not recognised."); + colorTool.warn("Notation " + rdfType + " not recognised."); } } @@ -163,12 +167,12 @@ public void execute() throws ParserException, IOException { try { notation = RDFNotation.guess(rdfInput); } catch (IllegalArgumentException e) { - warn("Could not guess notation for " + rdfInput + " Trying NTriples"); + colorTool.warn("Could not guess notation for " + rdfInput + " Trying NTriples"); notation = RDFNotation.NTRIPLES; } } - log("Converting " +rdfInput+" to "+hdtOutput+" as "+notation.name()); + colorTool.log("Converting " +rdfInput+" to "+hdtOutput+" as "+notation.name()); if (ntSimpleLoading) { spec.set("parser.ntSimpleParser", "true"); @@ -185,12 +189,12 @@ public void execute() throws ParserException, IOException { long maxTreeCatChunkSize = getMaxTreeCatChunkSize(); - log("Compute HDT with HDTCatTree using chunk of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true)); + colorTool.log("Compute HDT with HDTCatTree using chunk of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true)); if (disk) { if (diskLocation != null) { spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, diskLocation); - log("Using temp directory " + diskLocation); + colorTool.log("Using temp directory " + diskLocation); } MultiThreadListenerConsole listenerConsole = !quiet ? new MultiThreadListenerConsole(color) : null; hdt = HDTManager.catTree( @@ -218,12 +222,12 @@ public void execute() throws ParserException, IOException { } } else if (disk) { if (!quiet) { - log("Generating using generateHDTDisk"); + colorTool.log("Generating using generateHDTDisk"); } spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, hdtOutput); if (diskLocation != null) { spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, diskLocation); - log("Using temp directory " + diskLocation); + colorTool.log("Using temp directory " + diskLocation); } MultiThreadListenerConsole listenerConsole = !quiet ? new MultiThreadListenerConsole(color) : null; hdt = HDTManager.generateHDTDisk(rdfInput, baseURI, notation, CompressionType.guess(rdfInput), spec, listenerConsole); @@ -237,30 +241,30 @@ public void execute() throws ParserException, IOException { hdt = HDTManager.generateHDT(rdfInput, baseURI, notation, spec, listenerConsole); } - logValue("File converted in ..... ", sw.stopAndShow(), true); + colorTool.logValue("File converted in ..... ", sw.stopAndShow(), true); try { // Show Basic stats if(!quiet){ - logValue("Total Triples ......... ", "" + hdt.getTriples().getNumberOfElements()); - logValue("Different subjects .... ", "" + hdt.getDictionary().getNsubjects()); - logValue("Different predicates .. ", "" + hdt.getDictionary().getNpredicates()); - logValue("Different objects ..... ", "" + hdt.getDictionary().getNobjects()); - logValue("Common Subject/Object . ", "" + hdt.getDictionary().getNshared()); + colorTool.logValue("Total Triples ......... ", "" + hdt.getTriples().getNumberOfElements()); + colorTool.logValue("Different subjects .... ", "" + hdt.getDictionary().getNsubjects()); + colorTool.logValue("Different predicates .. ", "" + hdt.getDictionary().getNpredicates()); + colorTool.logValue("Different objects ..... ", "" + hdt.getDictionary().getNobjects()); + colorTool.logValue("Common Subject/Object . 
", "" + hdt.getDictionary().getNshared()); } // Dump to HDT file if (!disk && !catTree) { sw = new StopWatch(); hdt.saveToHDT(hdtOutput, this); - logValue("HDT saved to file in .. ", sw.stopAndShow()); + colorTool.logValue("HDT saved to file in .. ", sw.stopAndShow()); } // Generate index and dump it to .hdt.index file sw.reset(); if(generateIndex) { hdt = HDTManager.indexedHDT(hdt,this); - logValue("Index generated and saved in ", sw.stopAndShow()); + colorTool.logValue("Index generated and saved in ", sw.stopAndShow()); } } finally { if(hdt!=null) hdt.close(); @@ -280,67 +284,33 @@ public void notifyProgress(float level, String message) { } } - private String prefix(String pref, int r, int g, int b) { - return colorReset() + "[" + color(r, g, b) + pref + colorReset() + "]"; - } - - private void log(String msg) { - if (!quiet) { - System.out.println(prefix("INFO", 3, 1, 5) + " " + colorReset() + msg); - } - } - private void logValue(String msg, String value, boolean ignoreQuiet) { - if (!quiet || ignoreQuiet) { - System.out.println(color(3, 1, 5) + msg + colorReset() + value); - } - } - private void logValue(String msg, String value) { - logValue(msg, value, false); - } - private void warn(String msg) { - if (!quiet) { - System.out.println(prefix("WARN", 5, 5, 0) + " " + colorReset() + msg); - } - } - - private String color(int r, int g, int b) { - if (!color) { - return ""; - } - int color = 16 + 36*r + 6 * g + b; - return "\033[38;5;"+color+"m"; - } - - private String colorReset() { - return color ? "\033[0m" : ""; - } - @SuppressWarnings("deprecation") public static void main(String[] args) throws Throwable { RDF2HDT rdf2hdt = new RDF2HDT(); JCommander com = new JCommander(rdf2hdt, args); com.setProgramName("rdf2hdt"); + rdf2hdt.colorTool = new ColorTool(rdf2hdt.color, rdf2hdt.quiet); if (rdf2hdt.printoptions) { Collection values = HDTOptionsKeys.getOptionMap().values(); for (HDTOptionsKeys.Option opt : values) { - System.out.println(rdf2hdt.color(3, 1, 5) + "Key: " + rdf2hdt.color(5, 1, 0) + opt.getKey()); + System.out.println(rdf2hdt.colorTool.color(3, 1, 5) + "Key: " + rdf2hdt.colorTool.color(5, 1, 0) + opt.getKey()); if (!opt.getKeyInfo().desc().isEmpty()) { - System.out.println(rdf2hdt.color(3, 1, 5) + "Desc: " + rdf2hdt.colorReset() + opt.getKeyInfo().desc()); + System.out.println(rdf2hdt.colorTool.color(3, 1, 5) + "Desc: " + rdf2hdt.colorTool.colorReset() + opt.getKeyInfo().desc()); } - System.out.println(rdf2hdt.color(3, 1, 5) + "Type: " + rdf2hdt.colorReset() + opt.getKeyInfo().type().getTitle()); + System.out.println(rdf2hdt.colorTool.color(3, 1, 5) + "Type: " + rdf2hdt.colorTool.colorReset() + opt.getKeyInfo().type().getTitle()); switch (opt.getKeyInfo().type()) { case BOOLEAN: - System.out.println(rdf2hdt.color(3, 1, 5) + "Possible values: " + rdf2hdt.colorReset() + "true|false"); + System.out.println(rdf2hdt.colorTool.color(3, 1, 5) + "Possible values: " + rdf2hdt.colorTool.colorReset() + "true|false"); break; case ENUM: - System.out.println(rdf2hdt.color(3, 1, 5) + "Possible value(s):"); + System.out.println(rdf2hdt.colorTool.color(3, 1, 5) + "Possible value(s):"); int max = opt.getValues().stream().mapToInt(vle -> vle.getValue().length()).max().orElse(0); for (HDTOptionsKeys.OptionValue vle : opt.getValues()) { - System.out.print(rdf2hdt.color(3, 3, 3) + "- " + rdf2hdt.colorReset() + vle.getValue()); + System.out.print(rdf2hdt.colorTool.color(3, 3, 3) + "- " + rdf2hdt.colorTool.colorReset() + vle.getValue()); if (!vle.getValueInfo().desc().isEmpty()) { - 
System.out.println(rdf2hdt.color(3, 3, 3) + " ".repeat(max - vle.getValue().length()) + " : " + vle.getValueInfo().desc()); + System.out.println(rdf2hdt.colorTool.color(3, 3, 3) + " ".repeat(max - vle.getValue().length()) + " : " + vle.getValueInfo().desc()); } else { System.out.println(); } @@ -356,18 +326,16 @@ public static void main(String[] args) throws Throwable { } if(rdf2hdt.parameters.size()==1) { - System.err.println("No input file specified, reading from standard input."); + rdf2hdt.colorTool.warn("No input file specified, reading from standard input."); rdf2hdt.rdfInput = "-"; rdf2hdt.hdtOutput = rdf2hdt.parameters.get(0); } else if(rdf2hdt.parameters.size()==2) { rdf2hdt.rdfInput = rdf2hdt.parameters.get(0); rdf2hdt.hdtOutput = rdf2hdt.parameters.get(1); - - } else if (showVersion){ - System.out.println(HDTVersion.get_version_string(".")); - System.exit(0); - } - else{ + } else if (rdf2hdt.showVersion){ + System.out.println(HDTVersion.get_version_string(".")); + System.exit(0); + } else { com.usage(); System.exit(1); } diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/ColorTool.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/ColorTool.java new file mode 100644 index 00000000..f95c147e --- /dev/null +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/ColorTool.java @@ -0,0 +1,83 @@ +package org.rdfhdt.hdt.util.listener; + +public class ColorTool { + private final boolean color; + private final boolean quiet; + + public ColorTool(boolean color, boolean quiet) { + this.color = color; + this.quiet = quiet; + } + + public ColorTool(boolean color) { + this(color, false); + } + + + public String prefix(String pref, int r, int g, int b) { + return colorReset() + "[" + color(r, g, b) + pref + colorReset() + "]"; + } + + public void log(String msg) { + log(msg, false); + } + public void log(String msg, boolean ignoreQuiet) { + if (!quiet || ignoreQuiet) { + System.out.println(prefix("INFO", 3, 1, 5) + " " + colorReset() + msg); + } + } + + public void logValue(String msg, String value, boolean ignoreQuiet) { + if (!quiet || ignoreQuiet) { + System.out.println(color(3, 1, 5) + msg + colorReset() + value); + } + } + + public void logValue(String msg, String value) { + logValue(msg, value, false); + } + + public void warn(String msg) { + warn(msg, false); + } + + public void warn(String msg, boolean ignoreQuiet) { + if (!quiet || ignoreQuiet) { + System.out.println(prefix("WARN", 5, 5, 0) + " " + colorReset() + msg); + } + } + public void error(String text) { + error(text, false); + } + + + public void error(String text, boolean ignoreQuiet) { + error(null, text, ignoreQuiet); + } + + public void error(String title, String text) { + error(title, text, false); + } + + public void error(String title, String text, boolean ignoreQuiet) { + if (!quiet || ignoreQuiet) { + if (title != null) { + System.out.println(prefix("ERRR", 5, 0, 0) + " " + prefix(title, 5, 3, 0) + " " + colorReset() + text); + } else { + System.out.println(prefix("ERRR", 5, 0, 0) + " " + colorReset() + text); + } + } + } + + public String color(int r, int g, int b) { + if (!color) { + return ""; + } + int color = 16 + 36 * r + 6 * g + b; + return "\033[38;5;" + color + "m"; + } + + public String colorReset() { + return color ? 
"\033[0m" : ""; + } +} diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java index 43424010..0815ff99 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java @@ -7,6 +7,7 @@ import org.rdfhdt.hdt.listener.MultiThreadListener; public class MultiThreadListenerConsole implements MultiThreadListener { + private static final int BAR_SIZE = 10; private static final String ERASE_LINE = "\r\033[K"; private static String goBackNLine(int line) { @@ -54,6 +55,37 @@ public String color(int r, int g, int b) { return "\033[38;5;" + color + "m"; } + public String backColor(int r, int g, int b) { + if (!color) { + return ""; + } + int color = 16 + 36 * r + 6 * g + b; + return "\033[48;5;" + color + "m"; + } + + public String progressBar(float level) { + String colorBar; + String colorText; + int iv = Math.min(100, Math.max(0, (int) (level))); + if (!color) { + colorText = ""; + colorBar = ""; + } else { + int diff = (iv - 1) % 50 + 1; + int delta = diff * 3 / 50; + if (iv <= 50) { + colorText = color(5 - delta, delta * 2 / 3, 0); + colorBar = backColor(5 - delta, delta * 2 / 3, 0) + colorText; + } else { + colorText = color(2 - delta * 2 / 3, 2 + delta, 0); + colorBar = backColor(2 - delta * 2 / 3, 2 + delta, 0) + colorText; + } + } + int bar = iv * BAR_SIZE / 100; + return colorReset() + "[" + colorBar + "#".repeat(bar) + colorReset() + " ".repeat(BAR_SIZE - bar) + "] " + colorText + String.format(level >= 100 ? "%-5.1f" : "%-5.2f", level); + } + + public String colorReset() { return color ? "\033[0m" : ""; } @@ -92,7 +124,7 @@ public synchronized void unregisterThread(String threadName) { @Override public synchronized void notifyProgress(String thread, float level, String message) { - String msg = colorReset() + "[" + colorPercentage() + String.format(level >= 100 ? 
"%-5.1f" : "%-5.2f", level) + colorReset() + "] " + message; + String msg = colorReset() + progressBar(level) + colorReset() + " " + message; if (threadMessages != null) { threadMessages.put(thread, msg); render(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java index 81296c29..277e2c18 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java @@ -27,16 +27,30 @@ package org.rdfhdt.hdt.dictionary; -import org.rdfhdt.hdt.dictionary.impl.*; +import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionaryBig; +import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionaryDiff; +import org.rdfhdt.hdt.dictionary.impl.HashDictionary; +import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionaryDiff; +import org.rdfhdt.hdt.dictionary.impl.PSFCFourSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.PSFCTempDictionary; +import org.rdfhdt.hdt.dictionary.impl.WriteFourSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.WriteMultipleSectionDictionary; import org.rdfhdt.hdt.exceptions.IllegalFormatException; import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.hdt.impl.diskimport.MultiSectionSectionCompressor; +import org.rdfhdt.hdt.hdt.impl.diskimport.SectionCompressor; +import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; +import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; import java.nio.file.Path; -import java.util.Objects; /** * Factory that creates Dictionary objects @@ -89,7 +103,7 @@ public static Dictionary createDefaultDictionary() * @return TempDictionary */ public static TempDictionary createTempDictionary(HDTOptions spec) { - String name = Objects.requireNonNullElse(spec.get(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY), ""); + String name = spec.get(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, ""); // Implementations available in the Core switch (name) { @@ -112,7 +126,7 @@ public static TempDictionary createTempDictionary(HDTOptions spec) { * @return Dictionary */ public static DictionaryPrivate createDictionary(HDTOptions spec) { - String name = Objects.requireNonNullElse(spec.get(HDTOptionsKeys.DICTIONARY_TYPE_KEY), ""); + String name = spec.get(HDTOptionsKeys.DICTIONARY_TYPE_KEY, ""); switch (name) { case "": case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION: @@ -137,7 +151,7 @@ public static DictionaryPrivate createDictionary(HDTOptions spec) { * @return WriteDictionary */ public static DictionaryPrivate createWriteDictionary(HDTOptions spec, Path location, int bufferSize) { - String name = Objects.requireNonNullElse(spec.get(HDTOptionsKeys.DICTIONARY_TYPE_KEY), ""); + String name = spec.get(HDTOptionsKeys.DICTIONARY_TYPE_KEY, ""); switch (name) { case "": case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION: @@ -150,6 +164,24 @@ public static DictionaryPrivate createWriteDictionary(HDTOptions spec, Path loca } } + public static SectionCompressor createSectionCompressor(HDTOptions spec, CloseSuppressPath baseFileName, + 
AsyncIteratorFetcher source, + MultiThreadListener listener, int bufferSize, + long chunkSize, int k, boolean debugSleepKwayDict) { + String name = spec.get(HDTOptionsKeys.DICTIONARY_TYPE_KEY, ""); + + switch (name) { + case "": + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION: + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG: + return new SectionCompressor(baseFileName, source, listener, bufferSize, chunkSize, k, debugSleepKwayDict); + case HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS: + return new MultiSectionSectionCompressor(baseFileName, source, listener, bufferSize, chunkSize, k, debugSleepKwayDict); + default: + throw new IllegalFormatException("Implementation of section compressor not found for " + name); + } + } + /** * Creates a dictionary * diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java index 4b6d819d..8780b44a 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java @@ -307,9 +307,4 @@ public AbstractMap.SimpleEntry getDataTypeRange(CharSequence dataType } return new AbstractMap.SimpleEntry<>(0L,0L); } - - @Override - public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { - throw new NotImplementedException(); - } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java index 48a34a41..b944bdb9 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java @@ -7,15 +7,18 @@ import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory; import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySection; import org.rdfhdt.hdt.exceptions.IllegalFormatException; -import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.iterator.utils.MapIterator; +import org.rdfhdt.hdt.iterator.utils.PeekIterator; +import org.rdfhdt.hdt.iterator.utils.StopIterator; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.util.CustomIterator; import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -33,6 +36,7 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; +import java.util.function.Predicate; public class MultipleSectionDictionary extends MultipleBaseDictionary { @@ -73,6 +77,60 @@ public void load(TempDictionary other, ProgressListener listener) { shared.load(other.getShared(), iListener); } + private static class StopPredicate implements Predicate { + private CharSequence type; + + @Override + public boolean test(T charSequence) { + CharSequence type = LiteralsUtils.getType(charSequence); + if (this.type == null) { + this.type = type; + return true; + } + return this.type.equals(type); + } + + public void reset() { + 
this.type = null; + } + } + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + IntermediateListener iListener = new IntermediateListener(null); + new ExceptionThread(() -> predicates.load(other.getPredicates(), iListener), "MultiSecSAsyncReaderP") + .attach( + new ExceptionThread(() -> subjects.load(other.getSubjects(), iListener), "MultiSecSAsyncReaderS"), + new ExceptionThread(() -> shared.load(other.getShared(), iListener), "MultiSecSAsyncReaderSh"), + new ExceptionThread(() -> { + StopPredicate pred = new StopPredicate<>(); + PeekIterator it = new PeekIterator<>( + new StopIterator<>( + new MapIterator<>(other.getObjects().getSortedEntries(), b -> LiteralsUtils.prefToLit(ByteString.of(b))), + pred + ) + ); + + while (it.hasNext()) { + PFCDictionarySection section = new PFCDictionarySection(spec); + ByteString type = (ByteString) (LiteralsUtils.getType(it.peek())); + long count; + if (LiteralsUtils.isNoDatatype(type)) { + count = other.getObjects().getNumberOfElements() - shared.getNumberOfElements(); + } else { + // don't know the count + count = 1; + } + section.load(it.map(LiteralsUtils::removeType), count, listener); + pred.reset(); + objects.put(type, section); + } + }, "MultiSecSAsyncReaderO") + ) + .startAll() + .joinAndCrashIfRequired(); + } + /* (non-Javadoc) * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener) */ @@ -224,9 +282,4 @@ public void close() throws IOException { // close all subsections IOUtil.closeAll(objects.values()); } - - @Override - public void loadAsync(TempDictionary other, ProgressListener listener) { - throw new NotImplementedException(); - } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java index 5830fb90..59f34904 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java @@ -5,6 +5,7 @@ import org.rdfhdt.hdt.dictionary.TempDictionary; import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory; import org.rdfhdt.hdt.dictionary.impl.section.HashDictionarySection; +import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySection; import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySectionBig; import org.rdfhdt.hdt.exceptions.IllegalFormatException; import org.rdfhdt.hdt.hdt.HDTVocabulary; @@ -15,6 +16,7 @@ import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.util.CustomIterator; import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -27,6 +29,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -69,6 +72,34 @@ public void load(TempDictionary other, ProgressListener listener) { } shared.load(other.getShared(), iListener); } + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + IntermediateListener iListener = new IntermediateListener(null); + new ExceptionThread(() -> predicates.load(other.getPredicates(), iListener), 
"MultiSecSAsyncReaderP") + .attach( + new ExceptionThread(() -> subjects.load(other.getSubjects(), iListener), "MultiSecSAsyncReaderS"), + new ExceptionThread(() -> shared.load(other.getShared(), iListener), "MultiSecSAsyncReaderSh"), + new ExceptionThread(() -> { + Iterator iter = other.getObjects().getEntries(); + + // TODO: allow the usage of OneReadDictionarySection + Map literalsCounts = new HashMap<>(other.getObjects().getLiteralsCounts()); + literalsCounts.computeIfPresent(LiteralsUtils.NO_DATATYPE, (key, value) -> (value - other.getShared().getNumberOfElements())); + CustomIterator customIterator = new CustomIterator(iter, literalsCounts); + while (customIterator.hasNext()) { + PFCDictionarySection section = new PFCDictionarySection(spec); + ByteString type = ByteString.of(LiteralsUtils.getType(customIterator.prev)); + long numEntries = literalsCounts.get(type); + + section.load(customIterator, numEntries, listener); + section.locate(new CompactString("\"\uD83C\uDDEB\uD83C\uDDF7\"@ro")); + objects.put(type, section); + } + }, "MultiSecSAsyncReaderO") + ) + .startAll() + .joinAndCrashIfRequired(); + } /* (non-Javadoc) * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener) diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java index 85cdc185..c7b1f13a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java @@ -7,6 +7,7 @@ import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.iterator.utils.PeekIterator; import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.listener.ProgressListener; @@ -19,14 +20,14 @@ import org.rdfhdt.hdt.util.listener.IntermediateListener; import org.rdfhdt.hdt.util.listener.ListenerUtil; import org.rdfhdt.hdt.util.string.ByteString; -import org.rdfhdt.hdt.util.string.ByteStringUtil; -import org.rdfhdt.hdt.util.string.CharSequenceComparator; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Path; +import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; @@ -45,7 +46,7 @@ public WriteMultipleSectionDictionary(HDTOptions spec, Path filename, int buffer String name = filename.getFileName().toString(); subjects = new WriteDictionarySection(spec, filename.resolveSibling(name + "SU"), bufferSize); predicates = new WriteDictionarySection(spec, filename.resolveSibling(name + "PR"), bufferSize); - objects = new TreeMap<>(CharSequenceComparator.getInstance()); + objects = new TreeMap<>(); shared = new WriteDictionarySection(spec, filename.resolveSibling(name + "SH"), bufferSize); } @@ -54,67 +55,74 @@ public long getNAllObjects() { return objects.values().stream().mapToLong(DictionarySectionPrivate::getNumberOfElements).sum(); } - private ExceptionThread fillSection(Iterator objects, ProgressListener listener) { - PipedCopyIterator noDatatypeIterator = new PipedCopyIterator<>(); - PipedCopyIterator datatypeIterator = new PipedCopyIterator<>(); + private ExceptionThread fillSection(Iterator objects, 
long count, ProgressListener listener) { + PipedCopyIterator datatypeIterator = new PipedCopyIterator<>(); String name = filename.getFileName().toString(); - WriteDictionarySection noDatatypeSection = new WriteDictionarySection(spec, filename.resolveSibling(name + LiteralsUtils.NO_DATATYPE), bufferSize); - this.objects.put(LiteralsUtils.NO_DATATYPE, noDatatypeSection); + Map theObjects = Collections.synchronizedMap(this.objects); return new ExceptionThread(() -> { // object reader try { - CharSequence oldType = null; - boolean noDatatype = false; - while (objects.hasNext()) { - CharSequence next = objects.next(); + ByteString oldType = null; + long block = count < 10 ? 1 : count / 10; + long currentCount = 0; + for (;objects.hasNext(); currentCount++) { + ByteString next = (ByteString) objects.next(); - CharSequence type = LiteralsUtils.getType(next); + ByteString lit = (ByteString) LiteralsUtils.prefToLit(next); + ByteString type = (ByteString) LiteralsUtils.getType(lit); + + if (currentCount % block == 0) { + listener.notifyProgress((float) (currentCount * 100 / count), "Filling section"); + } if (oldType != null) { if (oldType.equals(type)) { - if (noDatatype) { - noDatatypeIterator.addElement(next); - } else { - datatypeIterator.addElement(next); - } + datatypeIterator.addElement(new TypedByteString(oldType, (ByteString) LiteralsUtils.removeType(lit))); continue; } else { - if (!noDatatype) { - datatypeIterator.closePipe(); - } + datatypeIterator.closePipe(); } } oldType = type; - if (LiteralsUtils.isNoDatatype(type)) { - noDatatypeIterator.addElement(next); - noDatatype = true; - } else { - datatypeIterator.addElement(next); - noDatatype = false; - } + datatypeIterator.addElement(new TypedByteString(oldType, (ByteString) LiteralsUtils.removeType(lit))); } - noDatatypeIterator.closePipe(); + datatypeIterator.closePipe(); datatypeIterator.closePipe(); } catch (Throwable e) { try { throw e; } finally { - try { - noDatatypeIterator.closePipe(e); - } finally { - datatypeIterator.closePipe(e); - } + datatypeIterator.closePipe(e); } } }, "MultiSecSAsyncObjectReader").attach(new ExceptionThread(() -> { // datatype writer - throw new NotImplementedException("MultiSecSAsyncObjectReader"); - }, "MultiSecSAsyncObjectDatatypeWriter")).attach(new ExceptionThread(() -> { - // no datatype writer -// noDatatypeSection.load(new OneReadDictionarySection(noDatatypeIterator), ); - throw new NotImplementedException("MultiSecSAsyncObjectReader"); - }, "MultiSecSAsyncObjectNoDatatypeWriter")); + PeekIterator dataTypePeekIt = new PeekIterator<>(datatypeIterator); + // section id to not having to write an URI on disk + Map sectionIds = new HashMap<>(); + + // check that we have at least one element to read + while (dataTypePeekIt.hasNext()) { + ByteString type = dataTypePeekIt.peek().getType(); + Long sid = sectionIds.get(type); + if (sid != null) { + // check that the section wasn't already defined + throw new IllegalArgumentException("type " + type + " is already defined"); + } + // create a new id + long sidNew = 1L + sectionIds.size(); + sectionIds.put(type, sidNew); + + // create the new section + WriteDictionarySection section = new WriteDictionarySection(spec, filename.resolveSibling(name + "type" + sidNew), bufferSize); + theObjects.put(type, section); + section.load(dataTypePeekIt.map(TypedByteString::getNode), count, null); + + // reset the pipe to allow reading more elements + ((PipedCopyIterator) dataTypePeekIt.getWrappedIterator()).reset(); + } + }, "MultiSecSAsyncObjectDatatypeWriter")); } 
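A note on the rewritten fillSection above: it relies on the reader thread emitting objects grouped by datatype and closing the pipe at each type change. The writer side uses PeekIterator to peek the first element of a run and name the new section, drains the run, then resets the single pipe for the next type; seeing a type twice means the input was not grouped, hence the "is already defined" guard. Below is a minimal self-contained sketch of that grouping idea, with a hypothetical Entry pair standing in for the patch's TypedByteString and a plain iterator standing in for PipedCopyIterator:

    // Sketch of the per-type run splitting done by MultiSecSAsyncObjectDatatypeWriter.
    // Assumes entries arrive grouped by type; Entry is a hypothetical stand-in type.
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    public class PerTypeRunSplitter {
        static final class Entry {
            final String type, node;
            Entry(String type, String node) { this.type = type; this.node = node; }
        }

        static Map<String, List<String>> splitByType(Iterator<Entry> it) {
            Map<String, List<String>> sections = new LinkedHashMap<>();
            Entry pending = it.hasNext() ? it.next() : null; // plays the role of PeekIterator.peek()
            while (pending != null) {
                String type = pending.type; // the peeked type names the new section
                List<String> section = new ArrayList<>();
                if (sections.put(type, section) != null) {
                    // same guard as above: a repeated type means the input wasn't grouped
                    throw new IllegalArgumentException("type " + type + " is already defined");
                }
                while (pending != null && pending.type.equals(type)) {
                    section.add(pending.node); // drain the whole run for this type
                    pending = it.hasNext() ? it.next() : null;
                }
            }
            return sections;
        }
    }

Reusing one pipe and resetting it between runs, instead of one pipe per datatype as the removed noDatatypeIterator did for a single special case, keeps the number of live buffers constant however many datatypes the dictionary contains.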
@Override @@ -125,7 +133,7 @@ public void loadAsync(TempDictionary other, ProgressListener listener) throws In () -> predicates.load(other.getPredicates(), new IntermediateListener(ml, "Predicate: ")), () -> subjects.load(other.getSubjects(), new IntermediateListener(ml, "Subjects: ")), () -> shared.load(other.getShared(), new IntermediateListener(ml, "Shared: ")) - ).attach(fillSection(other.getObjects().getEntries(), new IntermediateListener(ml, "Objects: "))) + ).attach(fillSection(other.getObjects().getEntries(), other.getObjects().getNumberOfElements(), new IntermediateListener(ml, "Objects: "))) .startAll() .joinAndCrashIfRequired(); ml.unregisterAllThreads(); @@ -153,7 +161,7 @@ public void save(OutputStream output, ControlInfo ci, ProgressListener listener) VByte.encode(output, objects.size()); for (Map.Entry entry : objects.entrySet()) { - IOUtil.writeSizedBuffer(output, entry.getKey().toString().getBytes(ByteStringUtil.STRING_ENCODING), listener); + IOUtil.writeSizedBuffer(output, entry.getKey(), listener); } for (Map.Entry entry : objects.entrySet()) { @@ -198,4 +206,22 @@ public void load(TempDictionary other, ProgressListener listener) { throw new NotImplementedException(); } + + private static class TypedByteString { + private final ByteString type; + private final ByteString node; + + public TypedByteString(ByteString type, ByteString node) { + this.type = type; + this.node = node; + } + + public ByteString getNode() { + return node; + } + + public ByteString getType() { + return type; + } + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java index b5db6936..5be82ce6 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java @@ -2,11 +2,9 @@ import org.rdfhdt.hdt.dictionary.TempDictionarySection; import org.rdfhdt.hdt.exceptions.NotImplementedException; -import org.rdfhdt.hdt.util.string.ByteString; import java.io.IOException; import java.util.Iterator; -import java.util.Map; import java.util.concurrent.atomic.AtomicReference; /** diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java index f172d6fb..b6e5f06d 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java @@ -17,8 +17,6 @@ import org.rdfhdt.hdt.util.listener.ListenerUtil; import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; -import org.rdfhdt.hdt.util.string.CompactString; -import org.rdfhdt.hdt.util.string.ReplazableString; import java.io.IOException; import java.io.InputStream; @@ -56,17 +54,20 @@ public WriteDictionarySection(HDTOptions spec, Path filename, int bufferSize) { @Override public void load(TempDictionarySection other, ProgressListener plistener) { + load(other.getSortedEntries(), other.getNumberOfElements(), plistener); + } + + public void load(Iterator it, long count, ProgressListener plistener) { MultiThreadListener listener = ListenerUtil.multiThreadListener(plistener); - long otherN = other.getNumberOfElements(); - long block = otherN < 10 ? 
1 : otherN / 10; + long block = count < 10 ? 1 : count / 10; long currentCount = 0; - blocks = new SequenceLog64BigDisk(blockTempFilename.toAbsolutePath().toString(), 64, otherN / blockSize); + blocks = new SequenceLog64BigDisk(blockTempFilename.toAbsolutePath().toString(), 64, count / blockSize); listener.notifyProgress(0, "Filling section"); try (CountOutputStream out = new CountOutputStream(tempFilename.openOutputStream(bufferSize))) { CRCOutputStream crcout = new CRCOutputStream(out, new CRC32()); ByteString previousStr = null; - for (Iterator it = other.getSortedEntries(); it.hasNext(); currentCount++) { + for (; it.hasNext(); currentCount++) { ByteString str = (ByteString) (it.next()); assert str != null; if (numberElements % blockSize == 0) { @@ -86,7 +87,7 @@ public void load(TempDictionarySection other, ProgressListener plistener) { previousStr = str; numberElements++; if (currentCount % block == 0) { - listener.notifyProgress((float) (currentCount * 100 / otherN), "Filling section"); + listener.notifyProgress((float) (currentCount * 100 / count), "Filling section"); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java index 64d3e5fa..72a756dd 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java @@ -1,5 +1,6 @@ package org.rdfhdt.hdt.hdt.impl; +import org.rdfhdt.hdt.dictionary.DictionaryFactory; import org.rdfhdt.hdt.dictionary.DictionaryPrivate; import org.rdfhdt.hdt.dictionary.impl.CompressFourSectionDictionary; import org.rdfhdt.hdt.enums.TripleComponentOrder; @@ -170,7 +171,8 @@ public CompressTripleMapper compressDictionary(Iterator iterator) profiler.pushSection("section compression"); CompressionResult compressionResult; try { - compressionResult = new SectionCompressor( + compressionResult = DictionaryFactory.createSectionCompressor( + hdtFormat, basePath.resolve("sectionCompression"), source, listener, diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java index e8887b40..014b9a73 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java @@ -3,7 +3,6 @@ import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.triples.IndexedNode; -import org.rdfhdt.hdt.util.io.CloseSuppressPath; import java.io.Closeable; import java.io.IOException; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/MultiSectionSectionCompressor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/MultiSectionSectionCompressor.java new file mode 100644 index 00000000..9a560729 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/MultiSectionSectionCompressor.java @@ -0,0 +1,23 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.string.ByteString; +import org.rdfhdt.hdt.util.string.CompactString; + +/** + * 
Implementation of SectionCompressor for MultiSection + */ +public class MultiSectionSectionCompressor extends SectionCompressor { + public MultiSectionSectionCompressor(CloseSuppressPath baseFileName, AsyncIteratorFetcher source, MultiThreadListener listener, int bufferSize, long chunkSize, int k, boolean debugSleepKwayDict) { + super(baseFileName, source, listener, bufferSize, chunkSize, k, debugSleepKwayDict); + } + + @Override + protected ByteString convertObject(CharSequence seq) { + return new CompactString(LiteralsUtils.litToPref(seq)); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java index 5626cb6d..43f28b2b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java @@ -1,8 +1,6 @@ package org.rdfhdt.hdt.hdt.impl.diskimport; import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; -import org.rdfhdt.hdt.iterator.utils.IndexNodeDeltaMergeExceptionIterator; -import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; import org.rdfhdt.hdt.iterator.utils.SizeFetcher; import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.triples.IndexedNode; @@ -29,6 +27,7 @@ import java.io.OutputStream; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Function; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PeekIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PeekIterator.java new file mode 100644 index 00000000..6f64709b --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PeekIterator.java @@ -0,0 +1,64 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Iterator; +import java.util.function.Function; + +/** + * Iterator with peek-able element + * + * @param iterator type + */ +public class PeekIterator implements Iterator { + private final Iterator it; + private T next; + + public PeekIterator(Iterator it) { + this.it = it; + } + + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + if (!it.hasNext()) { + return false; + } + next = it.next(); + return true; + } + + @Override + public T next() { + try { + return peek(); + } finally { + next = null; + } + } + + /** + * @return peek the element without passing to the next element + */ + public T peek() { + if (hasNext()) { + return next; + } + return null; + } + + /** + * map this iterator + * + * @param mappingFunction func + * @param new type + * @return iterator + */ + public Iterator map(Function mappingFunction) { + return new MapIterator<>(this, mappingFunction); + } + + public Iterator getWrappedIterator() { + return it; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/StopIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/StopIterator.java new file mode 100644 index 00000000..3d60267e --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/StopIterator.java @@ -0,0 +1,40 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Iterator; +import java.util.Objects; +import java.util.function.Predicate; + +public class StopIterator implements Iterator { + private final Iterator it; + private T next; + private final Predicate stop; + + 
public StopIterator(Iterator it, Predicate stop) { + this.it = Objects.requireNonNull(it, "it can't be null!"); + this.stop = Objects.requireNonNull(stop, "stop can't be null!"); + } + + + @Override + public boolean hasNext() { + if (next == null) { + if (!it.hasNext()) { + return false; + } + next = it.next(); + } + return stop.test(next); + } + + @Override + public T next() { + if (!hasNext()) { + return null; + } + try { + return next; + } finally { + next = null; + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java index 30f61fb0..e33eb6d6 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java @@ -10,7 +10,9 @@ import java.util.ConcurrentModificationException; public class LiteralsUtils { + public static final byte DATATYPE_BYTE = '!'; public static final String NO_DATATYPE_STR = "NO_DATATYPE"; + public static final ByteString TYPE_OPERATOR = ByteString.of("^^"); static final String LITERAL_LANG_TYPE_STR = ""; /** * no datatype type @@ -141,7 +143,7 @@ static boolean isLangType(CharSequence s, int start) { * place the type before the literal * *
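(a worked case, assuming the hypothetical datatype IRI http://example.org/t:
 * litToPref maps "12"^^<http://example.org/t> to !<http://example.org/t>"12",
 * where '!' is DATATYPE_BYTE, and prefToLit reverses the mapping)
 *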

example: {@literal "aa"^^} -> {@literal ^^"aa"}
 *
- * example: "aa" -> "aa"
+ * example: "aa" -> $"aa"
 *
 * example: "aa"@fr -> {@literal ^^"aa"@fr}
* * @param str the literal @@ -150,8 +152,8 @@ static boolean isLangType(CharSequence s, int start) { public static CharSequence litToPref(CharSequence str) { // language literal if (containsLanguage(str)) { - ReplazableString prefixedValue = new ReplazableString(2 + LITERAL_LANG_TYPE.length() + str.length()); - prefixedValue.append(new byte[]{'^', '^'}, 0, 2); + ReplazableString prefixedValue = new ReplazableString(1 + LITERAL_LANG_TYPE.length() + str.length()); + prefixedValue.append(new byte[]{DATATYPE_BYTE}, 0, 1); prefixedValue.append(LITERAL_LANG_TYPE.getBuffer(), 0, LITERAL_LANG_TYPE.length()); prefixedValue.appendNoCompact(str); return prefixedValue; @@ -162,8 +164,9 @@ public static CharSequence litToPref(CharSequence str) { // typed literal if (index != -1 && index < str.length()) { // add the literal value - ReplazableString prefixedValue = new ReplazableString(str.length()); - prefixedValue.append(new byte[]{'^', '^'}, 0, 2); + // -1 because len("^^") = len(DATATYPE_BYTE) + 1 + ReplazableString prefixedValue = new ReplazableString(str.length() - 1); + prefixedValue.append(new byte[]{DATATYPE_BYTE}, 0, 1); prefixedValue.appendNoCompact(str, index, str.length() - index); prefixedValue.appendNoCompact(str, 0, index - 2); return prefixedValue; @@ -212,17 +215,15 @@ public static CharSequence removePrefType(CharSequence str) { * @return literal */ public static CharSequence prefToLit(CharSequence str) { - if (str.length() < 4 || !(str.charAt(0) == '^' && str.charAt(1) == '^')) { + if (str.length() < 1 || !(str.charAt(0) == DATATYPE_BYTE)) { return str; } - assert str.charAt(2) == '<' : "non typed literal prefix"; - - int index = 3; + int index = 2; - if (isLangType(str, 2)) { + if (isLangType(str, index - 1)) { // lang type, return without the type - return str.subSequence(LITERAL_LANG_TYPE.length() + 2, str.length()); + return str.subSequence(LITERAL_LANG_TYPE.length() + 1, str.length()); } while (index < str.length()) { @@ -234,9 +235,10 @@ public static CharSequence prefToLit(CharSequence str) { } assert index < str.length() - 1 && str.charAt(index + 1) == '"' : "badly typed literal prefix"; - ReplazableString bld = new ReplazableString(str.length()); + ReplazableString bld = new ReplazableString(str.length() + 1); bld.appendNoCompact(str, index + 1, str.length() - index - 1); - bld.appendNoCompact(str, 0, index + 1); + bld.appendNoCompact(TYPE_OPERATOR); + bld.appendNoCompact(str, 1, index); return bld; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java index ca330050..84a457af 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java @@ -32,6 +32,7 @@ import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.util.Reference; +import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.visnow.jlargearrays.LargeArrayUtils; @@ -263,6 +264,10 @@ public static void writeSizedBuffer(OutputStream output, byte[] buffer, Progress writeSizedBuffer(output, buffer, 0, buffer.length, listener); } + public static void writeSizedBuffer(OutputStream output, ByteString str, ProgressListener listener) throws IOException { + writeSizedBuffer(output, str.getBuffer(), 0, str.length(), listener); + } + public static void writeSizedBuffer(OutputStream output, byte[] buffer, int offset, int length, ProgressListener 
listener) throws IOException { // FIXME: Do by blocks and notify listener VByte.encode(output, length); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java index 02f7d593..c8d27b39 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java @@ -4,7 +4,6 @@ import org.junit.Test; import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; -import org.rdfhdt.hdt.iterator.utils.FileTripleIterator; import org.rdfhdt.hdt.iterator.utils.MapIterator; import org.rdfhdt.hdt.triples.IndexedNode; import org.rdfhdt.hdt.util.concurrent.ExceptionThread; diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java index b6f7f2e1..ff98649b 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java @@ -34,14 +34,12 @@ public void sectionTest() { OneReadDictionarySection sec1 = new OneReadDictionarySection( removeDupe(aa), - aa.size() - ); + aa.size()); assertIteratorEquals(removeDupe(aa), sec1.getSortedEntries()); OneReadDictionarySection sec2 = new OneReadDictionarySection( removeDupe(aa), - aa.size() - ); + aa.size()); PFCDictionarySection section = new PFCDictionarySection(new HDTSpecification()); section.load(sec2, null); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java index ec8c3545..d6edc1ef 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java @@ -18,6 +18,7 @@ import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.NotFoundException; +import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; @@ -27,7 +28,9 @@ import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.rdf.RDFFluxStop; import org.rdfhdt.hdt.rdf.RDFParserFactory; +import org.rdfhdt.hdt.triples.IteratorTripleID; import org.rdfhdt.hdt.triples.IteratorTripleString; +import org.rdfhdt.hdt.triples.TripleID; import org.rdfhdt.hdt.triples.TripleString; import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils; import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier; @@ -74,7 +77,7 @@ public class HDTManagerTest { public static class HDTManagerTestBase extends AbstractMapMemoryTest implements ProgressListener { protected static String[][] diskDict() { return new String[][]{ -// {HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH}, + {HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH}, {HDTOptionsKeys.DICTIONARY_TYPE_VALUE_FOUR_SECTION, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_HASH} }; } @@ -145,14 +148,14 @@ public static void 
assertEqualsHDT(HDT expected, HDT actual) throws NotFoundExce assertEquals(ed.getType(), ad.getType()); // test triples - IteratorTripleString actualIt = actual.search("", "", ""); - IteratorTripleString expectedIt = expected.search("", "", ""); + IteratorTripleID actualIt = actual.getTriples().searchAll(); + IteratorTripleID expectedIt = expected.getTriples().searchAll(); while (expectedIt.hasNext()) { assertTrue(actualIt.hasNext()); - TripleString expectedTriple = expectedIt.next(); - TripleString actualTriple = actualIt.next(); + TripleID expectedTriple = expectedIt.next(); + TripleID actualTriple = actualIt.next(); assertEquals(expectedIt.getLastTriplePosition(), actualIt.getLastTriplePosition()); assertEquals(expectedTriple, actualTriple); } @@ -409,9 +412,6 @@ private void generateDiskTest() throws IOException, ParserException, NotFoundExc assertNotNull(actual); try { assertEqualsHDT(expected, actual); - } catch (Throwable t) { - HDTTestUtils.printCoDictionary(expected, actual); - throw t; } finally { IOUtil.closeAll(expected, actual); } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/utils/HDTTestUtils.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/utils/HDTTestUtils.java index a812f65c..6bec9479 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/utils/HDTTestUtils.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/triples/impl/utils/HDTTestUtils.java @@ -7,6 +7,7 @@ import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.hdt.writer.TripleWriterHDT; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTOptionsKeys; @@ -164,12 +165,12 @@ public static void printCoDictionary(HDT hdt, HDT hdt2) { Map sect1; Map sect2; - if (HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS.equals(dict.getType())) { + if (HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION.equals(dict.getType())) { sect1 = dict.getAllObjects(); - assertEquals(HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, dict2.getType()); + assertEquals(HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION, dict2.getType()); sect2 = dict2.getAllObjects(); } else { - assertNotEquals(HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS, dict2.getType()); + assertNotEquals(HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION, dict2.getType()); Map sect11 = new TreeMap<>(CharSequenceComparator.getInstance()); sect11.put("subjects", dict.getSubjects()); sect11.put("predicates", dict.getPredicates()); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LiteralsUtilsTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LiteralsUtilsTest.java index 75c420a9..04232066 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LiteralsUtilsTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LiteralsUtilsTest.java @@ -34,6 +34,8 @@ public static void assertEqualsCompact(CharSequence excepted, CharSequence actua assertEquals(0, CharSequenceComparator.getInstance().compare(excepted, actual)); } + private static final String LIT_TYPE_DEL = new String(new byte[]{LiteralsUtils.DATATYPE_BYTE}); + @Test public void containsLanguageTest() { assertTrue(LiteralsUtils.containsLanguage("\"hello\"@fr")); @@ -71,12 +73,12 @@ public void litStrTest() { @Test public void litToPrefTest() { assertEqualsCompact("\"aaa\"", LiteralsUtils.litToPref("\"aaa\"")); - assertEqualsCompact("^^\"aaa\"", LiteralsUtils.litToPref("\"aaa\"^^")); - assertEqualsCompact("^^" 
+ LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr", LiteralsUtils.litToPref("\"aaa\"@fr-fr")); + assertEqualsCompact( LIT_TYPE_DEL+ "\"aaa\"", LiteralsUtils.litToPref("\"aaa\"^^")); + assertEqualsCompact(LIT_TYPE_DEL + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr", LiteralsUtils.litToPref("\"aaa\"@fr-fr")); assertEqualsCompact("\"aaa\"", LiteralsUtils.litToPref(LiteralsUtils.prefToLit("\"aaa\""))); - assertEqualsCompact("^^\"aaa\"", LiteralsUtils.litToPref(LiteralsUtils.prefToLit("^^\"aaa\""))); - assertEqualsCompact("^^" + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr", LiteralsUtils.litToPref(LiteralsUtils.prefToLit("^^" + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr"))); + assertEqualsCompact(LIT_TYPE_DEL + "\"aaa\"", LiteralsUtils.litToPref(LiteralsUtils.prefToLit(LIT_TYPE_DEL + "\"aaa\""))); + assertEqualsCompact(LIT_TYPE_DEL + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr", LiteralsUtils.litToPref(LiteralsUtils.prefToLit(LIT_TYPE_DEL + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr"))); assertEqualsCompact("", LiteralsUtils.litToPref("")); } @@ -84,11 +86,11 @@ public void litToPrefTest() { @Test public void prefToLitTest() { assertEqualsCompact("\"aaa\"", LiteralsUtils.litToPref("\"aaa\"")); - assertEqualsCompact("\"aaa\"^^", LiteralsUtils.prefToLit("^^\"aaa\"")); - assertEqualsCompact("\"aaa\"@fr-fr", LiteralsUtils.prefToLit("^^" + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr")); + assertEqualsCompact("\"aaa\"^^", LiteralsUtils.prefToLit(LIT_TYPE_DEL + "\"aaa\"")); + assertEqualsCompact("\"aaa\"@fr-fr", LiteralsUtils.prefToLit(LIT_TYPE_DEL + LiteralsUtils.LITERAL_LANG_TYPE_STR + "\"aaa\"@fr-fr")); assertEqualsCompact("", LiteralsUtils.prefToLit("")); - assertEqualsCompact("\"aaa\"", LiteralsUtils.litToPref(LiteralsUtils.litToPref("\"aaa\""))); + assertEqualsCompact("\"aaa\"", LiteralsUtils.prefToLit(LiteralsUtils.litToPref("\"aaa\""))); assertEqualsCompact("\"aaa\"^^", LiteralsUtils.prefToLit(LiteralsUtils.litToPref("\"aaa\"^^"))); assertEqualsCompact("\"aaa\"@fr-fr", LiteralsUtils.prefToLit(LiteralsUtils.litToPref("\"aaa\"@fr-fr"))); assertEqualsCompact("", LiteralsUtils.prefToLit(LiteralsUtils.litToPref(""))); From f0969fabd1eb467152c0e3b52e0d6793329beb55 Mon Sep 17 00:00:00 2001 From: qaate47 Date: Tue, 15 Nov 2022 14:52:15 +0100 Subject: [PATCH 9/9] add millis profiler to HDTCat and HDTDiff --- .../java/org/rdfhdt/hdt/hdt/HDTSupplier.java | 1 + .../org/rdfhdt/hdt/options/HDTOptions.java | 11 +- .../java/org/rdfhdt/hdt/util/Profiler.java | 209 ++++++++++++++--- .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java | 168 +++++++------- .../rdfhdt/hdt/hdt/impl/HDTDiskImporter.java | 57 +++-- .../java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java | 37 ++- .../org/rdfhdt/hdt/util/ProfilerTest.java | 214 +++++++++++------- 7 files changed, 471 insertions(+), 226 deletions(-) rename {hdt-java-core => hdt-api}/src/main/java/org/rdfhdt/hdt/util/Profiler.java (55%) diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java index 02b5bca3..e19be43f 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java @@ -5,6 +5,7 @@ import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.Profiler; import java.io.IOException; import java.nio.file.Path; diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java 
b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java index e122d173..1bcc14b0 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java @@ -28,6 +28,7 @@ package org.rdfhdt.hdt.options; import org.rdfhdt.hdt.rdf.RDFFluxStop; +import org.rdfhdt.hdt.util.Profiler; import java.util.Objects; import java.util.function.DoubleSupplier; @@ -216,6 +217,15 @@ default void set(String key, RDFFluxStop fluxStop) { set(key, fluxStop.asConfig()); } + /** + * set a profiler id + * @param key key + * @param profiler profiler + */ + default void set(String key, Profiler profiler) { + set(key, "!" + profiler.getId()); + } + /** * set a long value * @@ -230,5 +240,4 @@ default void set(String key, RDFFluxStop fluxStop) { * @param options options */ void setOptions(String options); - } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java b/hdt-api/src/main/java/org/rdfhdt/hdt/util/Profiler.java similarity index 55% rename from hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java rename to hdt-api/src/main/java/org/rdfhdt/hdt/util/Profiler.java index f99eaec8..12e811d6 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/util/Profiler.java @@ -1,15 +1,11 @@ package org.rdfhdt.hdt.util; -import org.rdfhdt.hdt.compact.integer.VByte; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTOptionsKeys; -import org.rdfhdt.hdt.util.crc.CRC32; -import org.rdfhdt.hdt.util.crc.CRCInputStream; -import org.rdfhdt.hdt.util.crc.CRCOutputStream; -import org.rdfhdt.hdt.util.io.IOUtil; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; +import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -17,15 +13,31 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; +import java.util.concurrent.atomic.AtomicLong; /** * tool to profile time * * @author Antoine Willerval */ -public class Profiler { +public class Profiler implements AutoCloseable { + private static final AtomicLong PROFILER_IDS = new AtomicLong(); + private static final Map PROFILER = new HashMap<>(); + + /** + * get a non-closed profiler + * + * @param id profiler id + * @return profiler or null if closed or non-existing + */ + public static Profiler getProfilerById(long id) { + return PROFILER.get(id); + } + /** * Read the profiling values from an input path * @@ -35,29 +47,102 @@ public class Profiler { */ public static Profiler readFromDisk(Path inputPath) throws IOException { Profiler p = new Profiler(""); - try (CRCInputStream is = new CRCInputStream(new BufferedInputStream(Files.newInputStream(inputPath)), new CRC32())) { + try (InputStream is = new BufferedInputStream(Files.newInputStream(inputPath))) { for (byte b : HEADER) { if (is.read() != b) { throw new IOException("Missing header for the profiling file!"); } } p.mainSection = p.new Section(is, 0); - if (!is.readCRCAndCheck()) { - throw new IllegalArgumentException("CRC doesn't match when reading the CRC!"); + int checkSum = p.mainSection.computeCheckSum(); + int checkSumRead = (int) readLong(is); + if (checkSumRead != checkSum) { + throw new IOException("the Checksum isn't the same"); } } return p; } + private static long readLong(InputStream is) throws IOException { + byte[] longBuffer = 
readBuffer(is, 8); + return (longBuffer[0] & 0xFF) + | ((longBuffer[1] & 0xFFL) << 8) + | ((longBuffer[2] & 0xFFL) << 16) + | ((longBuffer[3] & 0xFFL) << 24) + | ((longBuffer[4] & 0xFFL) << 32) + | ((longBuffer[5] & 0xFFL) << 40) + | ((longBuffer[6] & 0xFFL) << 48) + | ((longBuffer[7] & 0xFFL) << 56); + } + + private static void writeLong(OutputStream os, long value) throws IOException { + os.write((byte) (value & 0xFF)); + os.write((byte) ((value >>> 8) & 0xFF)); + os.write((byte) ((value >>> 16) & 0xFF)); + os.write((byte) ((value >>> 24) & 0xFF)); + os.write((byte) ((value >>> 32) & 0xFF)); + os.write((byte) ((value >>> 40) & 0xFF)); + os.write((byte) ((value >>> 48) & 0xFF)); + os.write((byte) ((value >>> 56) & 0xFF)); + } + + private static byte[] readBuffer(InputStream input, int length) throws IOException { + int nRead; + int pos = 0; + byte[] data = new byte[length]; + + while ((nRead = input.read(data, pos, length - pos)) > 0) { + pos += nRead; + } + + if (pos != length) { + throw new EOFException("EOF while reading array from InputStream"); + } + + return data; + } + + /** + * create or load a profiler from the options into a subsection + * + * @param name name + * @param options options + * @param setId set the id after loading (if required) + * @return profiler + */ + public static Profiler createOrLoadSubSection(String name, HDTOptions options, boolean setId) { + // no options, we can't create + if (options == null) { + return new Profiler(name, null); + } + String profiler = options.get(HDTOptionsKeys.PROFILER_KEY); + if (profiler != null && profiler.length() != 0 && profiler.charAt(0) == '!') { + Profiler prof = getProfilerById(Long.parseLong(profiler.substring(1))); + if (prof != null) { + prof.pushSection(name); + prof.deep++; + return prof; + } + } + // no id, not an id + Profiler prof = new Profiler(name, options); + if (setId) { + options.set(HDTOptionsKeys.PROFILER_KEY, prof); + } + return prof; + } + private static final byte[] HEADER = {'H', 'D', 'T', 'P', 'R', 'O', 'F', 'I', 'L', 'E'}; private int maxSize = 0; private final String name; private Section mainSection; private boolean disabled; private Path outputPath; + private final long id; + private int deep = 0; /** - * create a profiler + * create a disabled profiler * * @param name the profiler name */ @@ -72,13 +157,19 @@ public Profiler(String name) { * @param spec spec (nullable) */ public Profiler(String name, HDTOptions spec) { + this.id = PROFILER_IDS.incrementAndGet(); + PROFILER.put(this.id, this); this.name = Objects.requireNonNull(name, "name can't be null!"); if (spec != null) { - disabled = !spec.getBoolean(HDTOptionsKeys.PROFILER_KEY); + String b = spec.get(HDTOptionsKeys.PROFILER_KEY); + disabled = b == null || b.length() == 0 || !(b.charAt(0) == '!' 
|| "true".equalsIgnoreCase(b)); String profilerOutputLocation = spec.get(HDTOptionsKeys.PROFILER_OUTPUT_KEY); if (profilerOutputLocation != null && !profilerOutputLocation.isEmpty()) { outputPath = Path.of(profilerOutputLocation); } + } else { + // no profiling by default + disabled = true; } } @@ -103,6 +194,17 @@ public void pushSection(String name) { getMainSection().pushSection(name, 0); } + /** + * @return profiler id + */ + public long getId() { + return id; + } + + public boolean isDisabled() { + return disabled; + } + /** * complete a section */ @@ -120,7 +222,7 @@ public void popSection() { * stop the profiler without poping sections */ public void stop() { - if (disabled) { + if (disabled || deep != 0) { return; } getMainSection().stop(); @@ -137,7 +239,7 @@ public void reset() { * write the profile into the console */ public void writeProfiling() throws IOException { - if (disabled) { + if (disabled || deep != 0) { return; } getMainSection().writeProfiling("", true); @@ -152,12 +254,13 @@ public void writeProfiling() throws IOException { * @param outputPath output path */ public void writeToDisk(Path outputPath) throws IOException { - try (CRCOutputStream os = new CRCOutputStream(new BufferedOutputStream(Files.newOutputStream(outputPath)), new CRC32())) { + try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(outputPath))) { for (byte b : HEADER) { os.write(b); } - getMainSection().writeSection(os); - os.writeCRC(); + Section mainSection = getMainSection(); + mainSection.writeSection(os); + writeLong(os, mainSection.computeCheckSum()); } } @@ -171,6 +274,16 @@ public Section getMainSection() { return this.mainSection; } + @Override + public void close() { + if (deep == 0) { + PROFILER.remove(getId()); + } else { + deep--; + popSection(); + } + } + /** * a section in the profiling */ @@ -183,7 +296,7 @@ public class Section { Section(String name) { this.name = name; - start = System.nanoTime(); + start = System.currentTimeMillis(); end = start; subSections = new ArrayList<>(); } @@ -195,16 +308,16 @@ public class Section { * @throws IOException io exception */ Section(InputStream is, int deep) throws IOException { - start = VByte.decode(is); - end = VByte.decode(is); + start = readLong(is); + end = readLong(is); - int nameLength = (int) VByte.decode(is); - byte[] nameBytes = IOUtil.readBuffer(is, nameLength, null); + int nameLength = (int) readLong(is); + byte[] nameBytes = readBuffer(is, nameLength); name = new String(nameBytes, StandardCharsets.UTF_8); maxSize = Math.max(name.length() + deep * 2, maxSize); - int subSize = (int) VByte.decode(is); + int subSize = (int) readLong(is); subSections = new ArrayList<>(subSize); for (int i = 0; i < subSize; i++) { subSections.add(new Section(is, deep + 1)); @@ -214,14 +327,14 @@ public class Section { void writeSection(OutputStream os) throws IOException { byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); - VByte.encode(os, start); - VByte.encode(os, end); + writeLong(os, start); + writeLong(os, end); - VByte.encode(os, nameBytes.length); + writeLong(os, nameBytes.length); os.write(nameBytes); List
sub = getSubSections(); - VByte.encode(os, sub.size()); + writeLong(os, sub.size()); for (Section s : sub) { s.writeSection(os); @@ -263,7 +376,7 @@ boolean popSection() { } return false; } else { - end = System.nanoTime(); + end = System.currentTimeMillis(); return true; } } @@ -294,15 +407,40 @@ public int hashCode() { return result; } + @Override + public String toString() { + return "Section{" + + "name='" + name + '\'' + + ", start=" + start + + ", end=" + end + + ", subSections=" + subSections + + ", currentSection=" + currentSection + + '}'; + } + void stop() { if (isRunning()) { currentSection.stop(); } - end = System.nanoTime(); + end = System.currentTimeMillis(); } public long getMillis() { - return (end - start) / 1_000_000L; + return end - start; + } + + /** + * @return start timestamp + */ + public long getStartMillis() { + return start; + } + + /** + * @return end timestamp + */ + public long getEndMillis() { + return end; } void writeProfiling(String prefix, boolean isLast) { @@ -312,5 +450,18 @@ void writeProfiling(String prefix, boolean isLast) { s.writeProfiling(prefix + (isLast ? " " : "| "), i == subSections.size() - 1); } } + + /** + * @return checksum for the profiling section + */ + public int computeCheckSum() { + int result = name.length(); + result = 31 * result + (int) (start ^ (start >>> 32)); + result = 31 * result + (int) (end ^ (end >>> 32)); + for (Section subSection : subSections) { + result = 31 * result ^ subSection.computeCheckSum(); + } + return result; + } } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index 9e0631fb..46f77c67 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -321,12 +321,18 @@ public HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, H try (HDT hdt1 = loadOrMapHDT(hdtFileName1, listener, hdtFormat); HDT hdt2 = loadOrMapHDT(hdtFileName2, listener, hdtFormat)) { HDTImpl hdt = new HDTImpl(hdtFormat); - if (hdt1.getDictionary() instanceof MultipleSectionDictionary - && hdt2.getDictionary() instanceof MultipleSectionDictionary) { - hdt.catCustom(location, hdt1, hdt2, listener); - } - else { - hdt.cat(location, hdt1, hdt2, listener); + try (Profiler profiler = Profiler.createOrLoadSubSection("hdtCat", hdtFormat, false)) { + try { + if (hdt1.getDictionary() instanceof MultipleSectionDictionary + && hdt2.getDictionary() instanceof MultipleSectionDictionary) { + hdt.catCustom(location, hdt1, hdt2, listener, profiler); + } else { + hdt.cat(location, hdt1, hdt2, listener, profiler); + } + } finally { + profiler.stop(); + profiler.writeProfiling(); + } } return hdt; } @@ -337,7 +343,9 @@ public HDT doHDTDiff(String hdtFileName1, String hdtFileName2, HDTOptions hdtFor try (HDT hdt1 = loadOrMapHDT(hdtFileName1, listener, hdtFormat); HDT hdt2 = loadOrMapHDT(hdtFileName2, listener, hdtFormat)) { HDTImpl hdt = new HDTImpl(hdtFormat); - hdt.diff(hdt1, hdt2, listener); + try (Profiler profiler = Profiler.createOrLoadSubSection("hdtDiff", hdtFormat, true)) { + hdt.diff(hdt1, hdt2, listener, profiler); + } return hdt; } } @@ -346,8 +354,8 @@ public HDT doHDTDiff(String hdtFileName1, String hdtFileName2, HDTOptions hdtFor protected HDT doHDTDiffBit(String location, String hdtFileName, Bitmap deleteBitmap, HDTOptions hdtFormat, ProgressListener listener) throws IOException { try (HDT hdtOriginal = loadOrMapHDT(hdtFileName, 
listener, hdtFormat)) { HDTImpl hdt = new HDTImpl(hdtFormat); - try { - hdt.diffBit(location, hdtOriginal, deleteBitmap, listener); + try (Profiler profiler = Profiler.createOrLoadSubSection("hdtDiffBit", hdtFormat, true)) { + hdt.diffBit(location, hdtOriginal, deleteBitmap, listener, profiler); } catch (Throwable t) { try { throw t; @@ -390,84 +398,84 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< Path futureHDTLocation = Optional.ofNullable(hdtFormat.get(HDTOptionsKeys.LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY)).map(Path::of).orElse(null); - Profiler profiler = new Profiler("doHDTCatTree", hdtFormat); - - FluxStopTripleStringIterator it = new FluxStopTripleStringIterator(iterator, fluxStop); - - List files = new ArrayList<>(); - - long gen = 0; - long cat = 0; - - Path hdtStore = basePath.resolve("hdt-store"); - Path hdtCatLocationPath = basePath.resolve("cat"); - String hdtCatLocation = hdtCatLocationPath.toAbsolutePath().toString(); - - Files.createDirectories(hdtStore); - Files.createDirectories(hdtCatLocationPath); - - boolean nextFile; - do { - // generate the hdt - gen++; - profiler.pushSection("generateHDT #" + gen); - PrefixListener il = PrefixListener.of("gen#" + gen, listener); - Path hdtLocation = hdtStore.resolve("hdt-" + gen + ".hdt"); - supplier.doGenerateHDT(it, baseURI, hdtFormat, il, hdtLocation); - il.clearThreads(); - - nextFile = it.hasNextFlux(); - HDTFile hdtFile = new HDTFile(hdtLocation, 1); - profiler.popSection(); - - // merge the generated hdt with each block with enough size - while (!files.isEmpty() && (!nextFile || (files.get(files.size() - 1)).getChunks() <= hdtFile.getChunks())) { - HDTFile lastHDTFile = files.remove(files.size() - 1); - cat++; - profiler.pushSection("catHDT #" + cat); - PrefixListener ilc = PrefixListener.of("cat#" + cat, listener); - Path hdtCatFileLocation = hdtStore.resolve("hdtcat-" + cat + ".hdt"); - try (HDT abcat = HDTManager.catHDT( - hdtCatLocation, - lastHDTFile.getHdtFile().toAbsolutePath().toString(), - hdtFile.getHdtFile().toAbsolutePath().toString(), - hdtFormat, ilc)) { - abcat.saveToHDT(hdtCatFileLocation.toAbsolutePath().toString(), ilc); - } - ilc.clearThreads(); - // delete previous chunks - Files.delete(lastHDTFile.getHdtFile()); - Files.delete(hdtFile.getHdtFile()); - // note the new hdt file and the number of chunks - hdtFile = new HDTFile(hdtCatFileLocation, lastHDTFile.getChunks() + hdtFile.getChunks()); + try (Profiler profiler = Profiler.createOrLoadSubSection("doHDTCatTree", hdtFormat, true)) { + FluxStopTripleStringIterator it = new FluxStopTripleStringIterator(iterator, fluxStop); + + List files = new ArrayList<>(); + + long gen = 0; + long cat = 0; + + Path hdtStore = basePath.resolve("hdt-store"); + Path hdtCatLocationPath = basePath.resolve("cat"); + String hdtCatLocation = hdtCatLocationPath.toAbsolutePath().toString(); + Files.createDirectories(hdtStore); + Files.createDirectories(hdtCatLocationPath); + + boolean nextFile; + do { + // generate the hdt + gen++; + profiler.pushSection("generateHDT #" + gen); + PrefixListener il = PrefixListener.of("gen#" + gen, listener); + Path hdtLocation = hdtStore.resolve("hdt-" + gen + ".hdt"); + supplier.doGenerateHDT(it, baseURI, hdtFormat, il, hdtLocation); + il.clearThreads(); + + nextFile = it.hasNextFlux(); + HDTFile hdtFile = new HDTFile(hdtLocation, 1); profiler.popSection(); - } - files.add(hdtFile); - } while (nextFile); - listener.notifyProgress(100, "done, loading HDT"); + // merge the generated hdt with each block with 
enough size + while (!files.isEmpty() && (!nextFile || (files.get(files.size() - 1)).getChunks() <= hdtFile.getChunks())) { + HDTFile lastHDTFile = files.remove(files.size() - 1); + cat++; + profiler.pushSection("catHDT #" + cat); + PrefixListener ilc = PrefixListener.of("cat#" + cat, listener); + Path hdtCatFileLocation = hdtStore.resolve("hdtcat-" + cat + ".hdt"); + try (HDT abcat = HDTManager.catHDT( + hdtCatLocation, + lastHDTFile.getHdtFile().toAbsolutePath().toString(), + hdtFile.getHdtFile().toAbsolutePath().toString(), + hdtFormat, ilc)) { + abcat.saveToHDT(hdtCatFileLocation.toAbsolutePath().toString(), ilc); + } + ilc.clearThreads(); + // delete previous chunks + Files.delete(lastHDTFile.getHdtFile()); + Files.delete(hdtFile.getHdtFile()); + // note the new hdt file and the number of chunks + hdtFile = new HDTFile(hdtCatFileLocation, lastHDTFile.getChunks() + hdtFile.getChunks()); + + profiler.popSection(); + } + files.add(hdtFile); + } while (nextFile); - Path hdtFile = files.get(0).hdtFile; + listener.notifyProgress(100, "done, loading HDT"); - assert files.get(0).getChunks() == gen; - assert cat < gen; + Path hdtFile = files.get(0).hdtFile; - // if a future HDT location has been asked, move to it and map the HDT - if (futureHDTLocation != null) { - Files.createDirectories(futureHDTLocation.toAbsolutePath().getParent()); - Files.deleteIfExists(futureHDTLocation); - Files.move(hdtFile, futureHDTLocation); - return HDTManager.mapHDT(futureHDTLocation.toAbsolutePath().toString()); - } + assert files.get(0).getChunks() == gen; + assert cat < gen; - // if no future location has been asked, load the HDT and delete it after - try { - return HDTManager.loadHDT(hdtFile.toAbsolutePath().toString()); - } finally { - Files.delete(hdtFile); - profiler.stop(); - profiler.writeProfiling(); + try { + // if a future HDT location has been asked, move to it and map the HDT + if (futureHDTLocation != null) { + Files.createDirectories(futureHDTLocation.toAbsolutePath().getParent()); + Files.deleteIfExists(futureHDTLocation); + Files.move(hdtFile, futureHDTLocation); + return HDTManager.mapHDT(futureHDTLocation.toAbsolutePath().toString()); + } + + // if no future location has been asked, load the HDT and delete it after + return HDTManager.loadHDT(hdtFile.toAbsolutePath().toString()); + } finally { + Files.deleteIfExists(hdtFile); + profiler.stop(); + profiler.writeProfiling(); + } } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java index 72a756dd..eab06663 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTDiskImporter.java @@ -10,7 +10,6 @@ import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.hdt.impl.diskimport.CompressTripleMapper; import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; -import org.rdfhdt.hdt.hdt.impl.diskimport.SectionCompressor; import org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult; import org.rdfhdt.hdt.header.HeaderPrivate; import org.rdfhdt.hdt.iterator.utils.AsyncIteratorFetcher; @@ -130,28 +129,36 @@ public HDTDiskImporter(HDTOptions hdtFormat, ProgressListener progressListener, // location of the future HDT file, do not set to create the HDT in memory while mergin futureHDTLocation = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY); - profiler = new Profiler("doGenerateHDTDisk", hdtFormat); - if (baseNameOpt == null || 
baseNameOpt.isEmpty()) { - basePath = CloseSuppressPath.of(Files.createTempDirectory("hdt-java-generate-disk")); - } else { - basePath = CloseSuppressPath.of(baseNameOpt); - } - basePath.closeWithDeleteRecurse(); - mapHDT = futureHDTLocation != null && !futureHDTLocation.isEmpty(); - // debug the build strategy - debugHDTBuilding = hdtFormat.getBoolean("debug.disk.build"); - - // create working directory - basePath.mkdirs(); - - if (!mapHDT) { - // using default implementation - hdt = new HDTImpl(hdtFormat); - } else { - // using map implementation - hdt = new WriteHDTImpl(hdtFormat, basePath.resolve("maphdt"), bufferSize); + profiler = Profiler.createOrLoadSubSection("doGenerateHDTDisk", hdtFormat, true); + try { + if (baseNameOpt == null || baseNameOpt.isEmpty()) { + basePath = CloseSuppressPath.of(Files.createTempDirectory("hdt-java-generate-disk")); + } else { + basePath = CloseSuppressPath.of(baseNameOpt); + } + basePath.closeWithDeleteRecurse(); + mapHDT = futureHDTLocation != null && !futureHDTLocation.isEmpty(); + // debug the build strategy + debugHDTBuilding = hdtFormat.getBoolean("debug.disk.build"); + + // create working directory + basePath.mkdirs(); + + if (!mapHDT) { + // using default implementation + hdt = new HDTImpl(hdtFormat); + } else { + // using map implementation + hdt = new WriteHDTImpl(hdtFormat, basePath.resolve("maphdt"), bufferSize); + } + hdt.setBaseUri(baseURI); + } catch (Throwable t) { + try { + throw t; + } finally { + profiler.close(); + } } - hdt.setBaseUri(baseURI); } /** @@ -360,7 +367,11 @@ public void close() throws IOException { profiler.writeProfiling(); listener.notifyProgress(100, "Clearing disk"); } finally { - basePath.close(); + try { + basePath.close(); + } finally { + profiler.close(); + } } } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java index 83663093..d2f1831a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java @@ -77,6 +77,7 @@ import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorDiff; import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorMapDiff; import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.Profiler; import org.rdfhdt.hdt.util.StopWatch; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; @@ -477,11 +478,12 @@ public boolean isMapped() { * @param hdt2 hdt2 * @param listener listener */ - public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException { + public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener, Profiler profiler) throws IOException { if (listener != null) { listener.notifyProgress(0, "Generating dictionary"); } try (FourSectionDictionaryCat dictionaryCat = new FourSectionDictionaryCat(location)) { + profiler.pushSection("catdict"); dictionaryCat.cat(hdt1.getDictionary(), hdt2.getDictionary(), listener); ControlInfo ci2 = new ControlInformation(); //map the generated dictionary @@ -497,13 +499,19 @@ public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener) this.dictionary.close(); } this.dictionary = dictionary; + + profiler.popSection(); + profiler.pushSection("cattriples"); + if (listener != null) { listener.notifyProgress(0, "Generating triples"); } BitmapTriplesIteratorCat it = new BitmapTriplesIteratorCat(hdt1.getTriples(), hdt2.getTriples(), dictionaryCat); BitmapTriplesCat 
bitmapTriplesCat = new BitmapTriplesCat(location); bitmapTriplesCat.cat(it, listener); + profiler.popSection(); } + profiler.pushSection("Clean and map"); //Delete the mappings since they are not necessary anymore Files.delete( Paths.get(location+"P1")); Files.delete( Paths.get(location+"P1"+"Types")); @@ -546,6 +554,7 @@ public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener) if (rawSize1 != -1 && rawSize2 != -1) { getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, String.valueOf(rawSize1 + rawSize2)); } + profiler.popSection(); } public static long getRawSize(Header header) { @@ -564,12 +573,14 @@ public static long getRawSize(Header header) { } } - public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException { + public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener listener, Profiler profiler) throws IOException { if (listener != null) { listener.notifyProgress(0, "Generating dictionary"); } try (DictionaryCat dictionaryCat = new MultipleSectionDictionaryCat(location)) { + profiler.pushSection("catdict"); dictionaryCat.cat(hdt1.getDictionary(), hdt2.getDictionary(), listener); + //map the generated dictionary ControlInfo ci2 = new ControlInformation(); try (CountInputStream fis = new CountInputStream(new BufferedInputStream(new FileInputStream(location + "dictionary")))) { @@ -583,6 +594,8 @@ public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener list dictionary.mapFromFile(fis, new File(location + "dictionary"), null); this.dictionary = dictionary; } + profiler.popSection(); + profiler.pushSection("cattriples"); if (listener != null) { listener.notifyProgress(0, "Generating triples"); @@ -590,7 +603,9 @@ public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener list BitmapTriplesIteratorCat it = new BitmapTriplesIteratorCat(hdt1.getTriples(), hdt2.getTriples(), dictionaryCat); BitmapTriplesCat bitmapTriplesCat = new BitmapTriplesCat(location); bitmapTriplesCat.cat(it,listener); + profiler.popSection(); } + profiler.pushSection("Clean and map"); //Delete the mappings since they are not necessary anymore int countSubSections = 0; for (CharSequence datatype : hdt1.getDictionary().getAllObjects().keySet()) { @@ -653,19 +668,23 @@ public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener list if (rawSize1 != -1 && rawSize2 != -1) { getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, String.valueOf(rawSize1 + rawSize2)); } + profiler.popSection(); } - public void diff(HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException { + public void diff(HDT hdt1, HDT hdt2, ProgressListener listener, Profiler profiler) throws IOException { ModifiableBitmap bitmap = BitmapFactory.createRWBitmap(hdt1.getTriples().getNumberOfElements()); BitmapTriplesIteratorDiff iterator = new BitmapTriplesIteratorDiff(hdt1, hdt2, bitmap); + profiler.pushSection("fill bitmap"); iterator.fillBitmap(); - diffBit(getHDTFileName(), hdt1, bitmap, listener); + profiler.popSection(); + diffBit(getHDTFileName(), hdt1, bitmap, listener, profiler); } - public void diffBit(String location, HDT hdt, Bitmap deleteBitmap, ProgressListener listener) throws IOException { + public void diffBit(String location, HDT hdt, Bitmap deleteBitmap, ProgressListener listener, Profiler profiler) throws IOException { IntermediateListener il = new IntermediateListener(listener); log.debug("Generating Dictionary..."); il.notifyProgress(0, "Generating Dictionary..."); + 
profiler.pushSection("diffdict"); IteratorTripleID hdtIterator = hdt.getTriples().searchAll(); DictionaryEntriesDiff iter = DictionaryEntriesDiff.createForType(hdt.getDictionary(), hdt, deleteBitmap, hdtIterator); @@ -687,7 +706,12 @@ public void diffBit(String location, HDT hdt, Bitmap deleteBitmap, ProgressListe dictionary.mapFromFile(fis, new File(location + "dictionary"), null); this.dictionary = dictionary; } + profiler.popSection(); + log.debug("Generating Triples..."); + + profiler.pushSection("difftriples"); + il.notifyProgress(40, "Generating Triples..."); // map the triples based on the new dictionary BitmapTriplesIteratorMapDiff mapIter = new BitmapTriplesIteratorMapDiff(hdt, deleteBitmap, diff); @@ -696,6 +720,8 @@ public void diffBit(String location, HDT hdt, Bitmap deleteBitmap, ProgressListe triples.load(mapIter, listener); this.triples = triples; } + profiler.popSection(); + profiler.pushSection("Clean and map"); log.debug("Clear data..."); il.notifyProgress(80, "Clear data..."); @@ -735,5 +761,6 @@ public void diffBit(String location, HDT hdt, Bitmap deleteBitmap, ProgressListe this.populateHeaderStructure(hdt.getBaseURI()); log.debug("Diff completed."); il.notifyProgress(100, "Diff completed..."); + profiler.popSection(); } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/ProfilerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/ProfilerTest.java index b00cf010..4b22748b 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/ProfilerTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/ProfilerTest.java @@ -3,6 +3,7 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; +import org.rdfhdt.hdt.options.HDTOptionsBase; import java.io.IOException; import java.nio.file.Path; @@ -17,123 +18,160 @@ public class ProfilerTest { public void ioTest() throws IOException, InterruptedException { Path root = tempDir.getRoot().toPath(); - Profiler profiler = new Profiler("test"); - profiler.pushSection("tests1"); - { - profiler.pushSection("tests1s1"); + try (Profiler profiler = new Profiler("test")) { + profiler.setDisabled(false); + profiler.pushSection("tests1"); { - Thread.sleep(25L); - } - profiler.popSection(); - - profiler.pushSection("tests1s2"); - { - Thread.sleep(5L); - } - profiler.popSection(); + profiler.pushSection("tests1s1"); + { + Thread.sleep(25L); + } + profiler.popSection(); - profiler.pushSection("tests1s3"); - { - profiler.pushSection("tests1s3s1"); + profiler.pushSection("tests1s2"); { Thread.sleep(5L); } profiler.popSection(); + + profiler.pushSection("tests1s3"); + { + profiler.pushSection("tests1s3s1"); + { + Thread.sleep(5L); + } + profiler.popSection(); + } + profiler.popSection(); + } + profiler.popSection(); + profiler.pushSection("tests2"); + { + Thread.sleep(5L); } profiler.popSection(); - } - profiler.popSection(); - profiler.pushSection("tests2"); - { - Thread.sleep(5L); - } - profiler.popSection(); - - profiler.stop(); - profiler.writeProfiling(); - Path profiling = root.resolve("profiling"); - profiler.writeToDisk(profiling); + profiler.stop(); + profiler.writeProfiling(); - Profiler p2 = Profiler.readFromDisk(profiling); + Path profiling = root.resolve("profiling"); + profiler.writeToDisk(profiling); - assertEquals(profiler.getMainSection(), p2.getMainSection()); + try (Profiler p2 = Profiler.readFromDisk(profiling)) { + assertEquals(profiler.getMainSection(), p2.getMainSection()); + } + } } @Test public void structTest() throws InterruptedException { - Profiler profiler = new 
Profiler("test"); - profiler.pushSection("tests1"); - { - profiler.pushSection("tests1s1"); + try (Profiler profiler = new Profiler("test")) { + profiler.setDisabled(false); + profiler.pushSection("tests1"); { - Thread.sleep(25L); + profiler.pushSection("tests1s1"); + { + Thread.sleep(25L); + } + profiler.popSection(); + + profiler.pushSection("tests1s2"); + { + Thread.sleep(5L); + } + profiler.popSection(); + + profiler.pushSection("tests1s3"); + { + profiler.pushSection("tests1s3s1"); + { + Thread.sleep(5L); + } + profiler.popSection(); + } + profiler.popSection(); } profiler.popSection(); - - profiler.pushSection("tests1s2"); + profiler.pushSection("tests2"); { Thread.sleep(5L); } profiler.popSection(); - profiler.pushSection("tests1s3"); - { - profiler.pushSection("tests1s3s1"); - { - Thread.sleep(5L); - } - profiler.popSection(); + profiler.stop(); + + Profiler.Section test = profiler.getMainSection(); + assertEquals("test", test.getName()); + List testSub = test.getSubSections(); + assertEquals(2, testSub.size()); + + Profiler.Section tests1 = testSub.get(0); + assertEquals("tests1", tests1.getName()); + List tests1Sub = tests1.getSubSections(); + assertEquals(3, tests1Sub.size()); + + Profiler.Section tests1s1 = tests1Sub.get(0); + assertEquals("tests1s1", tests1s1.getName()); + List tests1s1Sub = tests1s1.getSubSections(); + assertEquals(0, tests1s1Sub.size()); + + Profiler.Section tests1s2 = tests1Sub.get(1); + assertEquals("tests1s2", tests1s2.getName()); + List tests1s2Sub = tests1s2.getSubSections(); + assertEquals(0, tests1s2Sub.size()); + + Profiler.Section tests1s3 = tests1Sub.get(2); + assertEquals("tests1s3", tests1s3.getName()); + List tests1s3Sub = tests1s3.getSubSections(); + assertEquals(1, tests1s3Sub.size()); + + Profiler.Section tests1s3s1 = tests1s3Sub.get(0); + assertEquals("tests1s3s1", tests1s3s1.getName()); + assertEquals(0, tests1s3s1.getSubSections().size()); + + Profiler.Section tests2 = testSub.get(1); + assertEquals("tests2", tests2.getName()); + assertEquals(0, tests2.getSubSections().size()); + } + } + + @Test + public void loadBack() { + long id; + try (Profiler prof = new Profiler("test")) { + prof.setDisabled(false); + id = prof.getId(); + try (Profiler p2 = Profiler.getProfilerById(id)) { + assertNotNull(p2); + assertEquals(prof, p2); } - profiler.popSection(); } - profiler.popSection(); - profiler.pushSection("tests2"); - { - Thread.sleep(5L); + assertNull(Profiler.getProfilerById(id)); + } + + @Test + public void loadBackOpt() { + HDTOptionsBase opt = new HDTOptionsBase(); + long id; + try (Profiler prof = Profiler.createOrLoadSubSection("test", opt, true)) { + id = prof.getId(); + Profiler p2 = Profiler.getProfilerById(id); + assertNotNull(p2); + assertEquals(prof, p2); + + try (Profiler p3 = Profiler.createOrLoadSubSection("test2", opt, true)) { + assertNotNull(p3); + assertEquals(prof, p3); + } } - profiler.popSection(); - - profiler.stop(); - - Profiler.Section test = profiler.getMainSection(); - assertEquals("test", test.getName()); - List testSub = test.getSubSections(); - assertEquals(2, testSub.size()); - - Profiler.Section tests1 = testSub.get(0); - assertEquals("tests1", tests1.getName()); - List tests1Sub = tests1.getSubSections(); - assertEquals(3, tests1Sub.size()); - - Profiler.Section tests1s1 = tests1Sub.get(0); - assertEquals("tests1s1", tests1s1.getName()); - List tests1s1Sub = tests1s1.getSubSections(); - assertEquals(0, tests1s1Sub.size()); - - Profiler.Section tests1s2 = tests1Sub.get(1); - assertEquals("tests1s2", 
tests1s2.getName()); - List tests1s2Sub = tests1s2.getSubSections(); - assertEquals(0, tests1s2Sub.size()); - - Profiler.Section tests1s3 = tests1Sub.get(2); - assertEquals("tests1s3", tests1s3.getName()); - List tests1s3Sub = tests1s3.getSubSections(); - assertEquals(1, tests1s3Sub.size()); - - Profiler.Section tests1s3s1 = tests1s3Sub.get(0); - assertEquals("tests1s3s1", tests1s3s1.getName()); - assertEquals(0, tests1s3s1.getSubSections().size()); - - Profiler.Section tests2 = testSub.get(1); - assertEquals("tests2", tests2.getName()); - assertEquals(0, tests2.getSubSections().size()); + assertNull(Profiler.getProfilerById(id)); } @Test(expected = IllegalArgumentException.class) public void popTest() { - Profiler p = new Profiler(""); - - p.popSection(); + try (Profiler p = new Profiler("")) { + p.setDisabled(false); + p.popSection(); + } } }
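
A minimal usage sketch of the options-based profiler sharing introduced in this patch: createOrLoadSubSection stores "!" + id under HDTOptionsKeys.PROFILER_KEY, so a nested operation receiving the same options attaches to the parent profiler as a sub-section instead of creating a second profiler. The class name ProfilerSharingDemo and the section names are illustrative, not part of the patch.

    import java.io.IOException;

    import org.rdfhdt.hdt.options.HDTOptionsBase;
    import org.rdfhdt.hdt.options.HDTOptionsKeys;
    import org.rdfhdt.hdt.util.Profiler;

    public class ProfilerSharingDemo {
        public static void main(String[] args) throws IOException {
            HDTOptionsBase opt = new HDTOptionsBase();
            // enable profiling; createOrLoadSubSection then replaces the value
            // with "!" + id so later calls can locate this profiler instance
            opt.set(HDTOptionsKeys.PROFILER_KEY, "true");

            try (Profiler parent = Profiler.createOrLoadSubSection("demo", opt, true)) {
                parent.pushSection("outer work");

                // a nested component given the same options finds "!" + id and
                // attaches as a sub-section instead of opening a new profiler
                try (Profiler inner = Profiler.createOrLoadSubSection("inner work", opt, false)) {
                    // ... timed work ...
                } // close() pops the "inner work" section of the shared profiler

                parent.popSection();
                parent.stop();
                parent.writeProfiling(); // prints the section tree (millisecond timings)
            }
        }
    }

This mirrors how doHDTCatTree, HDTDiskImporter, and the cat/diff paths in this patch share a single profiler across nested generate and cat phases.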