From d39af554524755cdd0f216ffddc9e8eb2458b3e6 Mon Sep 17 00:00:00 2001 From: qaate47 Date: Tue, 22 Nov 2022 10:12:25 +0100 Subject: [PATCH 1/2] Implement k-HDTCat --- .../java/org/rdfhdt/hdt/hdt/HDTManager.java | 14 + .../org/rdfhdt/hdt/options/HDTOptions.java | 20 + .../rdfhdt/hdt/options/HDTOptionsKeys.java | 16 + .../java/org/rdfhdt/hdt/triples/TripleID.java | 11 +- hdt-java-cli/bin/javaenv.bat | 1 + hdt-java-cli/bin/javaenv.sh | 6 + .../java/org/rdfhdt/hdt/tools/HDTCat.java | 98 +- .../java/org/rdfhdt/hdt/tools/HDTVerify.java | 456 ++++++---- .../rdfhdt/hdt/util/listener/ColorTool.java | 24 +- .../listener/MultiThreadListenerConsole.java | 45 +- .../hdt/dictionary/DictionaryFactory.java | 63 ++ .../rdfhdt/hdt/dictionary/DictionaryKCat.java | 56 ++ .../impl/WriteFourSectionDictionary.java | 14 + .../impl/WriteMultipleSectionDictionary.java | 16 + .../impl/kcat/FourSectionDictionaryKCat.java | 67 ++ .../impl/kcat/GroupBySubjectMapIterator.java | 359 ++++++++ .../hdt/dictionary/impl/kcat/KCatImpl.java | 226 +++++ .../hdt/dictionary/impl/kcat/KCatMerger.java | 861 ++++++++++++++++++ .../impl/kcat/LocatedIndexedNode.java | 35 + .../kcat/MultipleSectionDictionaryKCat.java | 80 ++ .../impl/section/WriteDictionarySection.java | 8 +- .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java | 21 +- .../java/org/rdfhdt/hdt/hdt/impl/HDTBase.java | 22 + .../java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java | 16 - .../org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java | 18 +- .../hdt/iterator/utils/MapIterator.java | 17 + .../utils/MergeExceptionIterator.java | 99 +- .../rdfhdt/hdt/options/HDTOptionsBase.java | 6 +- .../rdfhdt/hdt/options/HideHDTOptions.java | 7 + .../org/rdfhdt/hdt/triples/IndexedNode.java | 15 +- .../rdfhdt/hdt/util/concurrent/SyncSeq.java | 32 + .../java/org/rdfhdt/hdt/util/io/Closer.java | 63 ++ .../dictionary/impl/kcat/KCatMergerTest.java | 318 +++++++ .../org/rdfhdt/hdt/hdt/HDTManagerTest.java | 9 +- .../org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java | 2 +- .../util/LargeFakeDataSetStreamSupplier.java | 27 +- hdt-java-package/bin/javaenv.bat | 1 + hdt-java-package/bin/javaenv.sh | 5 + 38 files changed, 2891 insertions(+), 263 deletions(-) create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryKCat.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/FourSectionDictionaryKCat.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/GroupBySubjectMapIterator.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatImpl.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMerger.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/LocatedIndexedNode.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/MultipleSectionDictionaryKCat.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncSeq.java create mode 100644 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/Closer.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMergerTest.java diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java index 78c1127d..56e4d203 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java @@ -4,6 +4,7 @@ import java.io.InputStream; import java.io.OutputStream; import 
java.util.Iterator; +import java.util.List; import org.rdfhdt.hdt.compact.bitmap.Bitmap; import org.rdfhdt.hdt.enums.CompressionType; @@ -461,6 +462,18 @@ public static TripleWriter getHDTWriter(String outFile, String baseURI, HDTOptio public static HDT catHDT(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException { return HDTManager.getInstance().doHDTCat(location, hdtFileName1, hdtFileName2, hdtFormat, listener); } + + /** + * Create an HDT file from HDT files by joining the triples. + * @param hdtFileNames hdt file names + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @throws IOException when the file cannot be found + * @return HDT + */ + public static HDT catHDT(List hdtFileNames, HDTOptions hdtFormat, ProgressListener listener) throws IOException { + return HDTManager.getInstance().doHDTCat(hdtFileNames, hdtFormat, listener); + } /** * Create a new HDT by removing from hdt1 the triples of hdt2. * @param hdtFileName1 First hdt file name @@ -561,6 +574,7 @@ public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator hdtFileNames, HDTOptions hdtFormat, ProgressListener listener) throws IOException; protected abstract HDT doHDTDiff(String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException; protected abstract HDT doHDTDiffBit(String location, String hdtFileName, Bitmap deleteBitmap, HDTOptions hdtFormat, ProgressListener listener) throws IOException; protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, String filename, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException; diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java index 1bcc14b0..3647b129 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptions.java @@ -27,10 +27,12 @@ package org.rdfhdt.hdt.options; +import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.rdf.RDFFluxStop; import org.rdfhdt.hdt.util.Profiler; import java.util.Objects; +import java.util.Set; import java.util.function.DoubleSupplier; import java.util.function.LongSupplier; import java.util.function.Supplier; @@ -55,6 +57,10 @@ public interface HDTOptions { */ String get(String key); + default Set getKeys() { + throw new NotImplementedException(); + } + /** * get a value * @@ -86,6 +92,20 @@ default String get(String key, Supplier defaultValue) { default boolean getBoolean(String key) { return "true".equalsIgnoreCase(get(key)); } + /** + * get a boolean + * + * @param key key + * @param defaultValue default value + * @return boolean or false if the value isn't defined + */ + default boolean getBoolean(String key, boolean defaultValue) { + String v = get(key); + if (v == null) { + return defaultValue; + } + return "true".equalsIgnoreCase(v); + } /** * get a double diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java index b4119996..a795da1c 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java @@ -271,6 +271,22 @@ public class HDTOptionsKeys 
{ @Value(key = DICTIONARY_TYPE_KEY, desc = "Multi section dictionary") public static final String DICTIONARY_TYPE_VALUE_MULTI_OBJECTS = "dictionaryMultiObj"; + /** + * Location of the HDTCat temp files + */ + @Key(type = Key.Type.PATH, desc = "Location of the HDTCat temp files") + public static final String HDTCAT_LOCATION = "hdtcat.location"; + /** + * Location of the HDTCat hdt after the loading + */ + @Key(type = Key.Type.PATH, desc = "Location of the HDTCat hdt after the loading") + public static final String HDTCAT_FUTURE_LOCATION = "hdtcat.location.future"; + /** + * Delete the HDTCat temp files directory after HDTCat + */ + @Key(type = Key.Type.BOOLEAN, desc = "Delete the HDTCat temp files directory after HDTCat, default to true") + public static final String HDTCAT_DELETE_LOCATION = "hdtcat.deleteLocation"; + // use tree-map to have a better order private static final Map OPTION_MAP = new TreeMap<>(); diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleID.java b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleID.java index f21b9097..fc25c0e1 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleID.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleID.java @@ -37,7 +37,7 @@ * TripleID holds a triple using Long IDs * */ -public final class TripleID implements Comparable, Serializable { +public final class TripleID implements Comparable, Serializable, Cloneable { private static final long serialVersionUID = -4685524566493494912L; private long subject; @@ -255,6 +255,15 @@ public boolean equals(Object o) { return !( subject!=other.subject || predicate!=other.predicate || object!=other.object ); } + @Override + public TripleID clone() { + try { + return (TripleID) super.clone(); + } catch (CloneNotSupportedException e) { + throw new AssertionError(e); + } + } + @Override public int hashCode() { return (int) (subject * 13 + predicate * 17 + object * 31); diff --git a/hdt-java-cli/bin/javaenv.bat b/hdt-java-cli/bin/javaenv.bat index d0fb3032..4cbc8db9 100644 --- a/hdt-java-cli/bin/javaenv.bat +++ b/hdt-java-cli/bin/javaenv.bat @@ -1,5 +1,6 @@ set JAVAOPTIONS=-Xmx1G set JAVACMD=java +set RDFHDT_COLOR=false set JAVACP="%~dp0\..\target;%~dp0\..\target\classes;%~dp0\..\target\dependency\*.jar;. 
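The three new options and the list-based catHDT entry point are meant to be used together. A minimal usage sketch (the file names and the temp path are placeholders, not part of the patch):

    import java.util.List;
    import org.rdfhdt.hdt.hdt.HDT;
    import org.rdfhdt.hdt.hdt.HDTManager;
    import org.rdfhdt.hdt.options.HDTOptionsKeys;
    import org.rdfhdt.hdt.options.HDTSpecification;

    public class KCatUsageSketch {
        public static void main(String[] args) throws java.io.IOException {
            HDTSpecification spec = new HDTSpecification();
            // keep the k-HDTCat temp files in a known directory...
            spec.set(HDTOptionsKeys.HDTCAT_LOCATION, "/tmp/kcat-work");
            // ...and delete that directory once the cat is done (the default)
            spec.set(HDTOptionsKeys.HDTCAT_DELETE_LOCATION, "true");
            // k-way cat: any number of input HDTs, null listener for no progress
            try (HDT hdt = HDTManager.catHDT(List.of("p1.hdt", "p2.hdt", "p3.hdt"), spec, null)) {
                hdt.saveToHDT("merged.hdt", null);
            }
        }
    }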
diff --git a/hdt-java-cli/bin/javaenv.sh b/hdt-java-cli/bin/javaenv.sh index b5ba719b..d0d329f3 100755 --- a/hdt-java-cli/bin/javaenv.sh +++ b/hdt-java-cli/bin/javaenv.sh @@ -21,7 +21,13 @@ else JAVA="$JAVA_HOME/bin/java -server" fi +# Set HDT Color options, set to true to allow color +if [ "$RDFHDT_COLOR" = "" ] ; then + export RDFHDT_COLOR="false" +fi + # Set Java options if [ "$JAVA_OPTIONS" = "" ] ; then JAVA_OPTIONS="-Xmx1g" fi + diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTCat.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTCat.java index 23184470..ba0736b8 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTCat.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTCat.java @@ -24,19 +24,24 @@ import com.beust.jcommander.internal.Lists; import org.apache.commons.io.FileUtils; -import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdt.hdt.HDTVersion; import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.util.StopWatch; +import org.rdfhdt.hdt.util.listener.ColorTool; +import org.rdfhdt.hdt.util.listener.MultiThreadListenerConsole; import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.Path; import java.nio.file.Paths; import java.util.List; +import java.util.stream.Collectors; /** * @author Dennis Diefenbach @@ -44,11 +49,9 @@ */ public class HDTCat implements ProgressListener { - public String hdtInput1; - public String hdtInput2; - public String hdtOutput; + private ColorTool colorTool; - @Parameter(description = " ") + @Parameter(description = "+ ") public List parameters = Lists.newArrayList(); @Parameter(names = "-options", description = "HDT Conversion options (override those of config file)") @@ -57,6 +60,9 @@ public class HDTCat implements ProgressListener { @Parameter(names = "-config", description = "Conversion config file") public String configFile; + @Parameter(names = "-kcat", description = "Use KCat algorithm, default if the count of input HDTs isn't 2") + public boolean kcat; + @Parameter(names = "-index", description = "Generate also external indices to solve all queries") public boolean generateIndex; @@ -66,8 +72,19 @@ public class HDTCat implements ProgressListener { @Parameter(names = "-quiet", description = "Do not show progress of the conversion") public boolean quiet; - public void execute() throws IOException { + @Parameter(names = "-color", description = "Print using color (if available)") + public boolean color; + + private HDT cat(String location, HDTOptions spec, ProgressListener listener) throws IOException{ + if (kcat) { + return HDTManager.catHDT(parameters.subList(0, parameters.size() - 1), spec, listener); + } else { + return HDTManager.catHDT(location, parameters.get(0), parameters.get(1), spec, listener); + } + } + + public void execute() throws IOException { HDTSpecification spec; if(configFile!=null) { spec = new HDTSpecification(configFile); @@ -78,35 +95,50 @@ public void execute() throws IOException { spec.setOptions(options); } + String hdtOutput = parameters.get(parameters.size() - 1); File file = new File(hdtOutput); - File theDir = new File(file.getAbsolutePath()+"_tmp"); + + String locationOpt = spec.get(HDTOptionsKeys.HDTCAT_LOCATION); + + if (locationOpt == null) { + locationOpt = file.getAbsolutePath()+"_tmp"; + 
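+ // no temp directory configured: default next to the output file and push it back into the spec so the cat implementation reuses it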
spec.set(HDTOptionsKeys.HDTCAT_LOCATION, locationOpt); + } + + File theDir = new File(locationOpt); Files.createDirectories(theDir.toPath()); String location = theDir.getAbsolutePath()+"/"; - try (HDT hdt = HDTManager.catHDT(location,hdtInput1, hdtInput2 , spec,this)) { + ProgressListener listenerConsole = + !quiet ? (kcat ? new MultiThreadListenerConsole(color) : this) + : null; + StopWatch startCat = new StopWatch(); + try (HDT hdt = cat(location, spec, listenerConsole)) { + colorTool.logValue("Files cat in .......... ", startCat.stopAndShow(), true); + assert hdt != null; // Show Basic stats if(!quiet){ - System.out.println("Total Triples: "+hdt.getTriples().getNumberOfElements()); - System.out.println("Different subjects: "+hdt.getDictionary().getNsubjects()); - System.out.println("Different predicates: "+hdt.getDictionary().getNpredicates()); - System.out.println("Different objects: "+hdt.getDictionary().getNobjects()); - System.out.println("Common Subject/Object:"+hdt.getDictionary().getNshared()); + colorTool.logValue("Total Triples ......... ", "" + hdt.getTriples().getNumberOfElements()); + colorTool.logValue("Different subjects .... ", "" + hdt.getDictionary().getNsubjects()); + colorTool.logValue("Different predicates .. ", "" + hdt.getDictionary().getNpredicates()); + colorTool.logValue("Different objects ..... ", "" + hdt.getDictionary().getNobjects()); + colorTool.logValue("Common Subject/Object . ", "" + hdt.getDictionary().getNshared()); } // Dump to HDT file StopWatch sw = new StopWatch(); hdt.saveToHDT(hdtOutput, this); - System.out.println("HDT saved to file in: "+sw.stopAndShow()); - Files.delete(Paths.get(location+"dictionary")); - Files.delete(Paths.get(location+"triples")); + colorTool.logValue("HDT saved to file in .. ", sw.stopAndShow()); + Files.deleteIfExists(Path.of(location + "dictionary")); + Files.deleteIfExists(Path.of(location+"triples")); FileUtils.deleteDirectory(theDir); // Generate index and dump it to .hdt.index file sw.reset(); - if(generateIndex) { + if (generateIndex) { HDTManager.indexedHDT(hdt,this); - System.out.println("Index generated and saved in: "+sw.stopAndShow()); + colorTool.logValue("Index generated and saved in ", sw.stopAndShow()); } } @@ -124,29 +156,31 @@ public void notifyProgress(float level, String message) { } } - @SuppressWarnings("deprecation") public static void main(String[] args) throws Throwable { HDTCat hdtCat = new HDTCat(); - System.out.println("Welcome to hdtCat!"); - System.out.println("This tool was developed by Dennis Diefenbach and Jośe M. Giḿenez-Garćıa"); - JCommander com = new JCommander(hdtCat, args); + JCommander com = new JCommander(hdtCat); + com.parse(args); com.setProgramName("hdtCat"); + hdtCat.colorTool = new ColorTool(hdtCat.color, hdtCat.quiet); - if(hdtCat.parameters.size()==3) { - hdtCat.hdtInput1 = hdtCat.parameters.get(0); - hdtCat.hdtInput2 = hdtCat.parameters.get(1); - hdtCat.hdtOutput = hdtCat.parameters.get(2); - } else if (showVersion){ - System.out.println(HDTVersion.get_version_string(".")); + hdtCat.colorTool.log("Welcome to hdtCat!"); + hdtCat.colorTool.log("This tool was developed by Dennis Diefenbach and Jośe M. 
Giḿenez-Garćıa"); + + if (showVersion) { + hdtCat.colorTool.log(HDTVersion.get_version_string(".")); System.exit(0); - } - else{ + } else if (hdtCat.parameters.size() > 3) { + // force k-cat if we have more than 2 HDTs to cat + hdtCat.kcat = true; + } else if (hdtCat.parameters.size() < 3) { com.usage(); System.exit(1); } - System.out.println("Cat "+ hdtCat.hdtInput1+" and "+ hdtCat.hdtInput2+" to "+ hdtCat.hdtOutput); - + hdtCat.colorTool.log("Cat " + hdtCat.parameters.stream() + .limit(hdtCat.parameters.size() - 1) + .collect(Collectors.joining(", ")) + + " to " + hdtCat.parameters.get(hdtCat.parameters.size() - 1)); hdtCat.execute(); } } diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java index dc16d93d..657c105e 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java @@ -5,179 +5,313 @@ import com.beust.jcommander.internal.Lists; import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.impl.MultipleBaseDictionary; +import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.triples.IteratorTripleString; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.ColorTool; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.listener.MultiThreadListenerConsole; import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.ReplazableString; import java.io.IOException; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; public class HDTVerify { - private HDTVerify() { - } - - @Parameter(description = "") - public List parameters = Lists.newArrayList(); - - @Parameter(names = "-unicode", description = "Ignore UNICODE order") - public boolean unicode; - - @Parameter(names = "-color", description = "Print using color (if available)") - public boolean color; - - @Parameter(names = "-binary", description = "Print binaries of the string in case of signum error") - public boolean binary; - - @Parameter(names = "-quiet", description = "Do not show progress of the conversion") - public boolean quiet; - - @Parameter(names = "-load", description = "Load the HDT in memory for faster results (might be impossible for large a HDT)") - public boolean load; - - public ColorTool colorTool; - - private HDT loadOrMap(String file) throws IOException { - return load ? 
HDTManager.loadHDT(file) : HDTManager.mapHDT(file); - } - - private void print(byte[] arr) { - for (byte b : arr) { - System.out.printf("%02X ", b); - } - System.out.println(); - } - - private void print(CharSequence seq) { - if (seq instanceof CompactString) { - CompactString cs1 = (CompactString) seq; - print(cs1.getData()); - } - - if (seq instanceof String) { - String rs1 = (String) seq; - print(rs1.getBytes()); - } - } - - public boolean checkDictionarySectionOrder(Iterator it) { - ReplazableString prev = new ReplazableString(); - String lastStr = ""; - boolean error = false; - while (it.hasNext()) { - ByteString charSeq = ByteString.of(it.next()); - String str = charSeq.toString(); - - int cmp = prev.compareTo(charSeq); - - if (cmp >= 0) { - error = true; - if (cmp == 0) { - colorTool.error("Duplicated(bs)", prev + " == " + charSeq); - } else { - colorTool.error("Bad order(bs)", prev + " > " + charSeq); - } - } - - if (!unicode) { - int cmp2 = lastStr.compareTo(str); - - if (cmp2 >= 0) { - error = true; - if (cmp == 0) { - colorTool.error("Duplicated(str)", lastStr + " == " + str); - } else { - colorTool.error("Bad order(str)", lastStr + " > " + str); - } - } - - if (Math.signum(cmp) != Math.signum(cmp2)) { - error = true; - colorTool.error("Not equal", cmp + " != " + cmp2 + " for " + lastStr + " / " + str); - if (binary) { - print(prev); - print(charSeq); - print(lastStr); - print(str); - } - } - - lastStr = str; - } - - prev.replace(charSeq); - } - if (error) { - colorTool.warn("Not valid section"); - } else { - colorTool.log("valid section"); - } - return error; - } - - public void exec() throws Throwable { - try (HDT hdt = loadOrMap(parameters.get(0))) { - boolean error; - long count = 0; - if (hdt.getDictionary() instanceof MultipleBaseDictionary) { - colorTool.log("Checking subject entries"); - error = checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries()); - count += hdt.getDictionary().getSubjects().getNumberOfElements(); - colorTool.log("Checking predicate entries"); - error |= checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries()); - count += hdt.getDictionary().getPredicates().getNumberOfElements(); - colorTool.log("Checking object entries"); - Map allObjects = hdt.getDictionary().getAllObjects(); - for (Map.Entry entry : allObjects.entrySet()) { - CharSequence sectionName = entry.getKey(); - DictionarySection section = entry.getValue(); - colorTool.log("Checking object section " + sectionName); - error |= checkDictionarySectionOrder(section.getSortedEntries()); - count += section.getNumberOfElements(); - } - colorTool.log("Checking shared entries"); - error |= checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries()); - count += hdt.getDictionary().getShared().getNumberOfElements(); - } else { - colorTool.log("Checking subject entries"); - error = checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries()); - count += hdt.getDictionary().getSubjects().getNumberOfElements(); - colorTool.log("Checking predicate entries"); - error |= checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries()); - count += hdt.getDictionary().getPredicates().getNumberOfElements(); - colorTool.log("Checking object entries"); - error |= checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries()); - count += hdt.getDictionary().getObjects().getNumberOfElements(); - colorTool.log("Checking shared entries"); - error |= 
checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries());
- count += hdt.getDictionary().getShared().getNumberOfElements();
- }
-
- if (error) {
- colorTool.error("This HDT isn't valid", true);
- System.exit(-1);
- } else {
- colorTool.log(count + " element(s) parsed");
- colorTool.log(colorTool.color(0, 5, 0) + "This HDT is valid", true);
- }
- }
- }
-
- public static void main(String[] args) throws Throwable {
- HDTVerify verify = new HDTVerify();
- JCommander com = new JCommander(verify);
- com.parse(args);
- verify.colorTool = new ColorTool(verify.color, verify.quiet);
- com.setProgramName("hdtVerify");
- if (verify.parameters.size() < 1) {
- com.usage();
- System.exit(-1);
- }
- verify.exec();
- }
+ private HDTVerify() {
+ }
+
+ @Parameter(description = "")
+ public List<String> parameters = Lists.newArrayList();
+
+ @Parameter(names = "-unicode", description = "Ignore UNICODE order")
+ public boolean unicode;
+
+ @Parameter(names = "-progress", description = "Show progression")
+ public boolean progress;
+
+ @Parameter(names = "-color", description = "Print using color (if available)")
+ public boolean color;
+
+ @Parameter(names = "-binary", description = "Print binaries of the string in case of signum error")
+ public boolean binary;
+
+ @Parameter(names = "-quiet", description = "Do not show progress of the conversion")
+ public boolean quiet;
+
+ @Parameter(names = "-load", description = "Load the HDT in memory for faster results (might be impossible for a large HDT)")
+ public boolean load;
+
+ @Parameter(names = "-equals", description = "Test that all the input HDTs are equal instead of checking validity")
+ public boolean equals;
+
+ public ColorTool colorTool;
+
+ private HDT loadOrMap(String file, ProgressListener listener) throws IOException {
+ return load ?
HDTManager.loadHDT(file, listener) : HDTManager.mapHDT(file, listener);
+ }
+
+ private void print(byte[] arr) {
+ for (byte b : arr) {
+ System.out.printf("%02X ", b);
+ }
+ System.out.println();
+ }
+
+ private void print(CharSequence seq) {
+ if (seq instanceof CompactString) {
+ CompactString cs1 = (CompactString) seq;
+ print(cs1.getData());
+ }
+
+ if (seq instanceof String) {
+ String rs1 = (String) seq;
+ print(rs1.getBytes());
+ }
+ }
+
+ public boolean checkDictionarySectionOrder(String name, DictionarySection section, MultiThreadListenerConsole console) {
+ Iterator<? extends CharSequence> it = section.getSortedEntries();
+ long size = section.getNumberOfElements();
+ IntermediateListener il = new IntermediateListener(console);
+ il.setPrefix(name + ": ");
+ ReplazableString prev = new ReplazableString();
+ String lastStr = "";
+ boolean error = false;
+ long count = 0;
+ while (it.hasNext()) {
+ ByteString charSeq = ByteString.of(it.next());
+ String str = charSeq.toString();
+ count++;
+
+ int cmp = prev.compareTo(charSeq);
+
+ if (cmp >= 0) {
+ error = true;
+ if (cmp == 0) {
+ colorTool.error("Duplicated(bs)", prev + " == " + charSeq);
+ } else {
+ colorTool.error("Bad order(bs)", prev + " > " + charSeq);
+ }
+ }
+
+ if (!unicode) {
+ int cmp2 = lastStr.compareTo(str);
+
+ if (cmp2 >= 0) {
+ error = true;
+ if (cmp2 == 0) {
+ colorTool.error("Duplicated(str)", lastStr + " == " + str);
+ } else {
+ colorTool.error("Bad order(str)", lastStr + " > " + str);
+ }
+ }
+
+ if (Math.signum(cmp) != Math.signum(cmp2)) {
+ error = true;
+ colorTool.error("Not equal", cmp + " != " + cmp2 + " for " + lastStr + " / " + str);
+ if (binary) {
+ print(prev);
+ print(charSeq);
+ print(lastStr);
+ print(str);
+ }
+ }
+
+ lastStr = str;
+ }
+
+ if (count % 10_000 == 0) {
+ il.notifyProgress(
+ 100f * count / size,
+ "Verify (" + count + "/" + size + "): "
+ + colorTool.color(3, 3, 3)
+ + (str.length() > 17 ? (str.substring(0, 17) + "...") : str)
+ );
+ }
+
+ prev.replace(charSeq);
+ }
+ il.notifyProgress(100f, "Verify...");
+
+ if (error) {
+ colorTool.warn("Not valid section");
+ } else {
+ colorTool.log("valid section");
+ }
+ return error;
+ }
+
+ public boolean assertHdtEquals(HDT hdt1, HDT hdt2, MultiThreadListenerConsole console, String desc) {
+ IntermediateListener il = new IntermediateListener(console);
+ il.setPrefix(desc + ": ");
+ if (hdt1.getTriples().getNumberOfElements() != hdt2.getTriples().getNumberOfElements()) {
+ colorTool.error("HDT with different number of elements!");
+ return false;
+ }
+
+ IteratorTripleString its1;
+ IteratorTripleString its2;
+
+ try {
+ its1 = hdt1.search("", "", "");
+ its2 = hdt2.search("", "", "");
+ } catch (NotFoundException e) {
+ throw new AssertionError(e);
+ }
+
+ long tripleError = 0;
+ long count = 0;
+ long size = hdt1.getTriples().getNumberOfElements();
+ while (true) {
+ if (!its1.hasNext()) {
+ if (its2.hasNext()) {
+ colorTool.error("Bad iteration");
+ break;
+ }
+ return true;
+ }
+
+ if (!its2.hasNext()) {
+ colorTool.error("Bad iteration");
+ return false;
+ }
+
+ TripleString ts1 = its1.next();
+ TripleString ts2 = its2.next();
+ if (!ts1.equals(ts2)) {
+ colorTool.error("Triple not equal!", ts1 + "!=" + ts2);
+ tripleError++;
+ }
+
+ count++;
+
+ if (count % 10_000 == 0) {
+ String str = ts1.toString();
+ il.notifyProgress(
+ 100f * count / size,
+ "Verify (" + count + "/" + size + "): "
+ + colorTool.color(3, 3, 3)
+ + (str.length() > 17 ? (str.substring(0, 17) + "...") : str)
+ );
+ }
+ }
+
+ return tripleError == 0;
+ }
+
+
+ public void exec() throws Throwable {
+ MultiThreadListenerConsole console = progress ? new MultiThreadListenerConsole(color) : null;
+ colorTool.setConsole(console);
+ List<HDT> hdts = new ArrayList<>(parameters.size());
+
+ try {
+ for (String hdtLocation : parameters) {
+ hdts.add(loadOrMap(hdtLocation, console));
+ }
+ if (equals) {
+ // we know that we have at least one HDT
+ HDT current = hdts.get(0);
+
+ boolean error = false;
+ for (int i = 1; i < hdts.size(); i++) {
+ if (!assertHdtEquals(current, hdts.get(i), console, "#0?" + i)) {
+ colorTool.error("HDTs NOT EQUAL!", "hdt#0 != hdt#" + i);
+ error = true;
+ }
+ }
+
+ if (error) {
+ colorTool.error("HDTs not equal!", true);
+ System.exit(-1);
+ } else {
+ colorTool.log(colorTool.color(0, 5, 0) + "All the HDTs are equal", true);
+ }
+
+ if (console != null) {
+ console.removeLast();
+ }
+
+ } else {
+ for (HDT hdtl : hdts) {
+ try (HDT hdt = hdtl) {
+ boolean error;
+ long count = 0;
+ if (hdt.getDictionary() instanceof MultipleBaseDictionary) {
+ colorTool.log("Checking subject entries");
+ error = checkDictionarySectionOrder("subject", hdt.getDictionary().getSubjects(), console);
+ count += hdt.getDictionary().getSubjects().getNumberOfElements();
+ colorTool.log("Checking predicate entries");
+ error |= checkDictionarySectionOrder("predicate", hdt.getDictionary().getPredicates(), console);
+ count += hdt.getDictionary().getPredicates().getNumberOfElements();
+ colorTool.log("Checking object entries");
+ Map<? extends CharSequence, DictionarySection> allObjects = hdt.getDictionary().getAllObjects();
+ for (Map.Entry<? extends CharSequence, DictionarySection> entry : allObjects.entrySet()) {
+ CharSequence sectionName = entry.getKey();
+ DictionarySection section = entry.getValue();
+ colorTool.log("Checking object section " + sectionName);
+ error |= checkDictionarySectionOrder(String.valueOf(sectionName), section, console);
+ count += section.getNumberOfElements();
+ }
+ colorTool.log("Checking shared entries");
+ error |= checkDictionarySectionOrder("shared", hdt.getDictionary().getShared(), console);
+ count += hdt.getDictionary().getShared().getNumberOfElements();
+ } else {
+ colorTool.log("Checking subject entries");
+ error = checkDictionarySectionOrder("subject", hdt.getDictionary().getSubjects(), console);
+ count += hdt.getDictionary().getSubjects().getNumberOfElements();
+ colorTool.log("Checking predicate entries");
+ error |= checkDictionarySectionOrder("predicate", hdt.getDictionary().getPredicates(), console);
+ count += hdt.getDictionary().getPredicates().getNumberOfElements();
+ colorTool.log("Checking object entries");
+ error |= checkDictionarySectionOrder("object", hdt.getDictionary().getObjects(), console);
+ count += hdt.getDictionary().getObjects().getNumberOfElements();
+ colorTool.log("Checking shared entries");
+ error |= checkDictionarySectionOrder("shared", hdt.getDictionary().getShared(), console);
+ count += hdt.getDictionary().getShared().getNumberOfElements();
+ }
+
+ if (error) {
+ colorTool.error("This HDT isn't valid", true);
+ System.exit(-1);
+ } else {
+ colorTool.log(count + " element(s) parsed");
+ colorTool.log(colorTool.color(0, 5, 0) + "This HDT is valid", true);
+ }
+
+ if (console != null) {
+ console.removeLast();
+ }
+ }
+ }
+ }
+ } catch (Throwable t) {
+ IOUtil.closeAll(hdts);
+ throw t;
+ }
+
+ }
+
+ public static void main(String[] args) throws Throwable {
+ HDTVerify verify = new HDTVerify();
+ JCommander com = new JCommander(verify);
+ com.parse(args);
+ verify.colorTool = new
ColorTool(verify.color, verify.quiet); + com.setProgramName("hdtVerify"); + if (verify.parameters.size() < 1) { + com.usage(); + System.exit(-1); + } + verify.exec(); + } } diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/ColorTool.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/ColorTool.java index f95c147e..07e77c6f 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/ColorTool.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/ColorTool.java @@ -3,9 +3,10 @@ public class ColorTool { private final boolean color; private final boolean quiet; + private MultiThreadListenerConsole console; public ColorTool(boolean color, boolean quiet) { - this.color = color; + this.color = color || MultiThreadListenerConsole.ALLOW_COLOR_SEQUENCE; this.quiet = quiet; } @@ -13,6 +14,17 @@ public ColorTool(boolean color) { this(color, false); } + public void setConsole(MultiThreadListenerConsole console) { + this.console = console; + } + + private void print(String str) { + if (console != null) { + console.printLine(str); + } else { + System.out.println(str); + } + } public String prefix(String pref, int r, int g, int b) { return colorReset() + "[" + color(r, g, b) + pref + colorReset() + "]"; @@ -23,13 +35,13 @@ public void log(String msg) { } public void log(String msg, boolean ignoreQuiet) { if (!quiet || ignoreQuiet) { - System.out.println(prefix("INFO", 3, 1, 5) + " " + colorReset() + msg); + print(prefix("INFO", 3, 1, 5) + " " + colorReset() + msg); } } public void logValue(String msg, String value, boolean ignoreQuiet) { if (!quiet || ignoreQuiet) { - System.out.println(color(3, 1, 5) + msg + colorReset() + value); + print(color(3, 1, 5) + msg + colorReset() + value); } } @@ -43,7 +55,7 @@ public void warn(String msg) { public void warn(String msg, boolean ignoreQuiet) { if (!quiet || ignoreQuiet) { - System.out.println(prefix("WARN", 5, 5, 0) + " " + colorReset() + msg); + print(prefix("WARN", 5, 5, 0) + " " + colorReset() + msg); } } public void error(String text) { @@ -62,9 +74,9 @@ public void error(String title, String text) { public void error(String title, String text, boolean ignoreQuiet) { if (!quiet || ignoreQuiet) { if (title != null) { - System.out.println(prefix("ERRR", 5, 0, 0) + " " + prefix(title, 5, 3, 0) + " " + colorReset() + text); + print(prefix("ERRR", 5, 0, 0) + " " + prefix(title, 5, 3, 0) + " " + colorReset() + text); } else { - System.out.println(prefix("ERRR", 5, 0, 0) + " " + colorReset() + text); + print(prefix("ERRR", 5, 0, 0) + " " + colorReset() + text); } } } diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java index 0815ff99..1d6eaee0 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java @@ -19,6 +19,11 @@ private static String goBackNLine(int line) { */ private static final boolean ALLOW_ASCII_SEQUENCE; + /** + * true if the system allow color sequence, false otherwise + */ + static final boolean ALLOW_COLOR_SEQUENCE; + static { String env; try { @@ -28,6 +33,15 @@ private static String goBackNLine(int line) { } ALLOW_ASCII_SEQUENCE = System.console() != null && !(env == null || env.isEmpty()); + + String envC; + try { + envC = System.getenv("RDFHDT_COLOR"); + } catch (SecurityException e) { + envC = null; + } + + 
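+ // color output needs both a real console and an explicit RDFHDT_COLOR=true opt-in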
ALLOW_COLOR_SEQUENCE = System.console() != null && "true".equalsIgnoreCase(envC); } private final Map threadMessages; @@ -39,7 +53,7 @@ public MultiThreadListenerConsole(boolean color) { } public MultiThreadListenerConsole(boolean color, boolean asciiListener) { - this.color = color; + this.color = color || ALLOW_COLOR_SEQUENCE; if (asciiListener) { threadMessages = new TreeMap<>(); } else { @@ -133,7 +147,25 @@ public synchronized void notifyProgress(String thread, float level, String messa } } + public synchronized void printLine(String line) { + render(line); + } + + public void removeLast() { + StringBuilder message = new StringBuilder(); + if (previous != 0) { + for (int i = 0; i < previous; i++) { + message.append(goBackNLine(1)).append(ERASE_LINE); + } + } + System.out.print(message); + } + private void render() { + render(null); + } + + private void render(String ln) { if (threadMessages == null) { return; } @@ -141,15 +173,22 @@ private void render() { int lines = threadMessages.size(); message.append("\r"); // go back each line of the thread message + if (previous != 0) { - message.append(goBackNLine(previous)); + for (int i = 0; i < previous; i++) { + message.append(goBackNLine(1)).append(ERASE_LINE); + } + } + + if (ln != null) { + message.append(ln).append("\n"); } int maxThreadNameSize = threadMessages.keySet().stream().mapToInt(String::length).max().orElse(0) + 1; // write each thread logs threadMessages.forEach((thread, msg) -> message - .append(ERASE_LINE) + .append('\r') .append(colorReset()).append("[").append(colorThread()).append(thread).append(colorReset()).append("]") .append(" ").append(".".repeat(maxThreadNameSize - thread.length())).append(" ") .append(msg).append("\n")); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java index 277e2c18..63408085 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java @@ -37,6 +37,8 @@ import org.rdfhdt.hdt.dictionary.impl.PSFCTempDictionary; import org.rdfhdt.hdt.dictionary.impl.WriteFourSectionDictionary; import org.rdfhdt.hdt.dictionary.impl.WriteMultipleSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.kcat.FourSectionDictionaryKCat; +import org.rdfhdt.hdt.dictionary.impl.kcat.MultipleSectionDictionaryKCat; import org.rdfhdt.hdt.exceptions.IllegalFormatException; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.hdt.impl.diskimport.MultiSectionSectionCompressor; @@ -49,8 +51,10 @@ import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.triples.TripleString; import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.string.ByteString; import java.nio.file.Path; +import java.util.TreeMap; /** * Factory that creates Dictionary objects @@ -164,6 +168,27 @@ public static DictionaryPrivate createWriteDictionary(HDTOptions spec, Path loca } } + /** + * Creates a write-dictionary + * + * @param name name of the HDT Dictionary type + * @param spec specs to read dictionary + * @param location write location + * @param bufferSize write buffer sizes + * @return WriteDictionary + */ + public static DictionaryPrivate createWriteDictionary(String name, HDTOptions spec, Path location, int bufferSize) { + switch (name) { + case "": + case HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION: + return new WriteFourSectionDictionary(spec, location, bufferSize); + case 
HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION: + return new WriteMultipleSectionDictionary(spec, location, bufferSize); + default: + throw new IllegalFormatException("Implementation of write dictionary not found for " + name); + } + } + public static SectionCompressor createSectionCompressor(HDTOptions spec, CloseSuppressPath baseFileName, AsyncIteratorFetcher source, MultiThreadListener listener, int bufferSize, @@ -221,4 +246,42 @@ public static DictionaryDiff createDictionaryDiff(Dictionary dictionary, String throw new IllegalFormatException("Implementation of DictionaryDiff not found for " + type); } } + + /** + * create {@link org.rdfhdt.hdt.dictionary.DictionaryKCat} for HDTCat + * + * @param dictionary dictionary + * @return dictionaryKCat + */ + public static DictionaryKCat createDictionaryKCat(Dictionary dictionary) { + String type = dictionary.getType(); + switch (type) { + case HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION: + return new FourSectionDictionaryKCat(dictionary); + case HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION: + return new MultipleSectionDictionaryKCat(dictionary); + default: + throw new IllegalArgumentException("Implementation of DictionaryKCat not found for " + type); + } + } + + public static DictionaryPrivate createWriteDictionary( + String type, + HDTOptions spec, + DictionarySectionPrivate subject, + DictionarySectionPrivate predicate, + DictionarySectionPrivate object, + DictionarySectionPrivate shared, + TreeMap sub + ) { + switch (type) { + case HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION: + case HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION: + return new WriteFourSectionDictionary(spec, subject, predicate, object, shared); + case HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION: + return new WriteMultipleSectionDictionary(spec, subject, predicate, shared, sub); + default: + throw new IllegalArgumentException("Unknown dictionary type " + type); + } + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryKCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryKCat.java new file mode 100644 index 00000000..793c19b7 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryKCat.java @@ -0,0 +1,56 @@ +package org.rdfhdt.hdt.dictionary; + +import java.util.Map; + +public interface DictionaryKCat { + + /** + * @return the subsections to merge + */ + Map getSubSections(); + + /** + * @return the subject section + */ + DictionarySection getSubjectSection(); + + /** + * @return the object section + */ + DictionarySection getObjectSection(); + + /** + * @return the predicate section + */ + DictionarySection getPredicateSection(); + + /** + * @return the shared section + */ + DictionarySection getSharedSection(); + + /** + * @return the number of subjects + */ + long countSubjects(); + + /** + * @return the number of shared + */ + long countShared(); + + /** + * @return the number of predicates + */ + long countPredicates(); + + /** + * @return the number of objects + */ + long countObjects(); + + /** + * @return the object shift in the dictionary IDs + */ + long objectShift(); +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java index 73549bac..45156c5f 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java @@ -1,5 
+1,7 @@ package org.rdfhdt.hdt.dictionary.impl; +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; import org.rdfhdt.hdt.dictionary.TempDictionary; import org.rdfhdt.hdt.dictionary.impl.section.WriteDictionarySection; import org.rdfhdt.hdt.exceptions.NotImplementedException; @@ -23,6 +25,7 @@ /** * Version of four section dictionary with {@link org.rdfhdt.hdt.dictionary.impl.section.WriteDictionarySection} + * * @author Antoine Willerval */ public class WriteFourSectionDictionary extends BaseDictionary { @@ -35,6 +38,17 @@ public WriteFourSectionDictionary(HDTOptions spec, Path filename, int bufferSize shared = new WriteDictionarySection(spec, filename.resolveSibling(name + "SH"), bufferSize); } + public WriteFourSectionDictionary(HDTOptions spec, DictionarySectionPrivate subjects, + DictionarySectionPrivate predicates, + DictionarySectionPrivate objects, + DictionarySectionPrivate shared) { + super(spec); + this.subjects = subjects; + this.predicates = predicates; + this.objects = objects; + this.shared = shared; + } + @Override public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { MultiThreadListener ml = ListenerUtil.multiThreadListener(listener); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java index c7b1f13a..84a97846 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteMultipleSectionDictionary.java @@ -49,6 +49,22 @@ public WriteMultipleSectionDictionary(HDTOptions spec, Path filename, int buffer objects = new TreeMap<>(); shared = new WriteDictionarySection(spec, filename.resolveSibling(name + "SH"), bufferSize); } + public WriteMultipleSectionDictionary(HDTOptions spec, + DictionarySectionPrivate subjects, + DictionarySectionPrivate predicates, + DictionarySectionPrivate shared, + TreeMap objects) { + super(spec); + // useless + this.filename = null; + this.bufferSize = 0; + + // write sections + this.subjects = subjects; + this.predicates = predicates; + this.objects = objects; + this.shared = shared; + } @Override public long getNAllObjects() { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/FourSectionDictionaryKCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/FourSectionDictionaryKCat.java new file mode 100644 index 00000000..d8ddcdc2 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/FourSectionDictionaryKCat.java @@ -0,0 +1,67 @@ +package org.rdfhdt.hdt.dictionary.impl.kcat; + +import org.rdfhdt.hdt.dictionary.Dictionary; +import org.rdfhdt.hdt.dictionary.DictionaryKCat; +import org.rdfhdt.hdt.dictionary.DictionarySection; + +import java.util.Collections; +import java.util.Map; + +public class FourSectionDictionaryKCat implements DictionaryKCat { + private final Dictionary dictionary; + + public FourSectionDictionaryKCat(Dictionary dictionary) { + this.dictionary = dictionary; + } + + @Override + public Map getSubSections() { + return Collections.emptyMap(); + } + + @Override + public DictionarySection getSubjectSection() { + return dictionary.getSubjects(); + } + + @Override + public DictionarySection getObjectSection() { + return dictionary.getObjects(); + } + + @Override + public DictionarySection 
getPredicateSection() { + return dictionary.getPredicates(); + } + + @Override + public DictionarySection getSharedSection() { + return dictionary.getShared(); + } + + @Override + public long countSubjects() { + return dictionary.getSubjects().getNumberOfElements() + countShared(); + } + + @Override + public long countShared() { + return dictionary.getShared().getNumberOfElements(); + } + + + @Override + public long countPredicates() { + return dictionary.getPredicates().getNumberOfElements(); + } + + @Override + public long countObjects() { + return dictionary.getObjects().getNumberOfElements() + countShared(); + } + + @Override + public long objectShift() { + return countShared(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/GroupBySubjectMapIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/GroupBySubjectMapIterator.java new file mode 100644 index 00000000..17af745c --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/GroupBySubjectMapIterator.java @@ -0,0 +1,359 @@ +package org.rdfhdt.hdt.dictionary.impl.kcat; + +import org.rdfhdt.hdt.hdt.HDT; +import org.rdfhdt.hdt.iterator.utils.CombinedIterator; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MapIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.PeekIterator; +import org.rdfhdt.hdt.triples.IteratorTripleID; +import org.rdfhdt.hdt.triples.TripleID; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * @author Antoine Willerval + */ +public class GroupBySubjectMapIterator implements Iterator { + private final PeekIterator mergeIterator; + private final List groupList = new ArrayList<>(); + private Iterator groupListIterator; + private TripleID next; + + public GroupBySubjectMapIterator(Iterator mergeIterator) { + this.mergeIterator = new PeekIterator<>(mergeIterator); + } + + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + + // get triples from the group + if (groupListIterator != null) { + if (groupListIterator.hasNext()) { + next = groupListIterator.next(); + return true; + } + + // clear the group and set to new iteration + groupList.clear(); + groupListIterator = null; + } + + // do we have more elements? 
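+ // the merged stream is only sorted by subject: collect every triple of
+ // the next subject into a group, then re-sort the group by (S,P,O)
+ // before handing its triples out one by one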
+ if (!mergeIterator.hasNext()) { + return false; + } + + long subject = mergeIterator.peek().getSubject(); + + // we add all the elements while the subject are the same + do { + groupList.add(mergeIterator.next().clone()); + } while (mergeIterator.hasNext() && mergeIterator.peek().getSubject() == subject); + + groupList.sort(TripleID::compareTo); + + groupListIterator = groupList.iterator(); + + if (groupListIterator.hasNext()) { + next = groupListIterator.next(); + return true; + } + + // just to be sure + return false; + } + + @Override + public TripleID next() { + if (!hasNext()) { + return null; + } + try { + return next; + } finally { + next = null; + } + } + + private static long firstSubjectTripleId(HDT hdt) { + IteratorTripleID it = hdt.getTriples().search(new TripleID( + hdt.getDictionary().getNshared() + 1, + 0, + 0 + )); + if (it.hasNext()) { + // extract result + it.next(); + return it.getLastTriplePosition(); + } else { + return -1; + } + } + + public static Iterator fromHDTs(KCatMerger merger, HDT[] hdts) { + final long shared = merger.getCountShared(); + List> sharedSubjectIterators = IntStream.range(0, hdts.length) + .mapToObj(hdtIndex -> { + // extract hdt elements for this index + HDT hdt = hdts[hdtIndex]; + + // get the first subject triple id + long firstSubjectTripleId = firstSubjectTripleId(hdt); + + // create a subject iterator, mapped to the new IDs + IteratorTripleID subjectIterator = hdt.getTriples().searchAll(); + subjectIterator.goTo(firstSubjectTripleId); + ExceptionIterator subjectIteratorMapped = ExceptionIterator.of( + new SharedOnlyIterator( + new MapIterator<>(subjectIterator, (tid) -> { + assert inHDT(tid, hdts[hdtIndex]); + return merger.extractMapped(hdtIndex, tid); + }), + shared + ) + ); + + if (shared == 0) { + return subjectIteratorMapped; + } + + Iterator sharedIterator = new SharedStopIterator(hdt.getTriples().searchAll(), hdt.getDictionary().getNshared()); + Iterator sharedIteratorMapped = new MapIterator<>(sharedIterator, (tid) -> { + assert inHDT(tid, hdts[hdtIndex]); + return merger.extractMapped(hdtIndex, tid); + }); + + return new MergeExceptionIterator<>( + subjectIteratorMapped, + ExceptionIterator.of(sharedIteratorMapped), + Comparator.comparingLong(TripleID::getSubject) + ); + }).collect(Collectors.toList()); + List> subjectIterators = IntStream.range(0, hdts.length) + .mapToObj(hdtIndex -> { + // extract hdt elements for this index + HDT hdt = hdts[hdtIndex]; + + // get the first subject triple id + long firstSubjectTripleId = firstSubjectTripleId(hdt); + + // create a subject iterator, mapped to the new IDs + IteratorTripleID subjectIterator = hdt.getTriples().searchAll(); + subjectIterator.goTo(firstSubjectTripleId); + + return ExceptionIterator.of( + new NoSharedIterator( + new MapIterator<>(subjectIterator, (tid) -> merger.extractMapped(hdtIndex, tid)), + shared + ) + ); + }).collect(Collectors.toList()); + return new GroupBySubjectMapIterator( + new NoDupeTripleIDIterator( + CombinedIterator.combine(List.of( + MergeExceptionIterator.buildOfTree( + Function.identity(), + Comparator.comparingLong(TripleID::getSubject), + sharedSubjectIterators, + 0, + sharedSubjectIterators.size() + ).asIterator(), + MergeExceptionIterator.buildOfTree( + Function.identity(), + Comparator.comparingLong(TripleID::getSubject), + subjectIterators, + 0, + subjectIterators.size() + ).asIterator() + )) + )); + } + + private static boolean inHDT(TripleID id, HDT hdt) { + long s = id.getSubject(); + long p = id.getPredicate(); + long o = id.getObject(); + 
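+ // a triple belongs to the HDT iff each component ID falls inside its section's ID range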
return s >= 1 && s <= hdt.getDictionary().getNsubjects() + && p >= 1 && p <= hdt.getDictionary().getNpredicates() + && o >= 1 && o <= hdt.getDictionary().getNobjects(); + } + + private static class NoDupeTripleIDIterator implements Iterator { + private TripleID next; + private final PeekIterator it; + + public NoDupeTripleIDIterator(Iterator it) { + this.it = new PeekIterator<>(it); + } + + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + if (!it.hasNext()) { + return false; + } + + next = it.next(); + + assert next.isValid() : "Can't have empty tripleID"; + + // pass all the duplicated fields + while (it.hasNext() && it.peek().equals(next)) { + it.next(); + } + + return true; + } + + @Override + public TripleID next() { + if (!hasNext()) { + return null; + } + try { + return next; + } finally { + next = null; + } + } + } + + private static class SharedStopIterator implements Iterator { + private final Iterator it; + private final long shared; + private TripleID next; + + private SharedStopIterator(Iterator it, long shared) { + this.it = it; + this.shared = shared; + } + + + @Override + public boolean hasNext() { + if (next != null) { + return next.getSubject() <= shared; + } + + if (!it.hasNext()) { + return false; + } + + next = it.next(); + + return next.getSubject() <= shared; + } + + @Override + public TripleID next() { + if (!hasNext()) { + return null; + } + try { + return next; + } finally { + next = null; + } + } + } + + private static class SharedOnlyIterator implements Iterator { + private final Iterator it; + private final long shared; + private TripleID next; + + private SharedOnlyIterator(Iterator it, long shared) { + this.it = it; + this.shared = shared; + } + + + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + + // search over the next results + while (it.hasNext()) { + TripleID next = it.next(); + + // is this element a shared element? + if (next.getSubject() <= shared) { + this.next = next; + return true; + } + } + + return false; + } + + @Override + public TripleID next() { + if (!hasNext()) { + return null; + } + try { + return next; + } finally { + next = null; + } + } + } + + private static class NoSharedIterator implements Iterator { + private final Iterator it; + private final long shared; + private TripleID next; + + private NoSharedIterator(Iterator it, long shared) { + this.it = it; + this.shared = shared; + } + + + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + + // search over the next results + while (it.hasNext()) { + TripleID next = it.next(); + + // is this element a shared element? 
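+ // (NoSharedIterator: keep only the subjects above the shared section)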
+ if (next.getSubject() > shared) {
+ this.next = next;
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ @Override
+ public TripleID next() {
+ if (!hasNext()) {
+ return null;
+ }
+ try {
+ return next;
+ } finally {
+ next = null;
+ }
+ }
+ }
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatImpl.java
new file mode 100644
index 00000000..82136015
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatImpl.java
@@ -0,0 +1,226 @@
+package org.rdfhdt.hdt.dictionary.impl.kcat;
+
+import org.rdfhdt.hdt.dictionary.DictionaryPrivate;
+import org.rdfhdt.hdt.enums.TripleComponentOrder;
+import org.rdfhdt.hdt.hdt.HDT;
+import org.rdfhdt.hdt.hdt.HDTManager;
+import org.rdfhdt.hdt.hdt.HDTManagerImpl;
+import org.rdfhdt.hdt.hdt.HDTVocabulary;
+import org.rdfhdt.hdt.hdt.impl.HDTBase;
+import org.rdfhdt.hdt.hdt.impl.WriteHDTImpl;
+import org.rdfhdt.hdt.header.HeaderFactory;
+import org.rdfhdt.hdt.listener.ProgressListener;
+import org.rdfhdt.hdt.options.HDTOptions;
+import org.rdfhdt.hdt.options.HDTOptionsKeys;
+import org.rdfhdt.hdt.triples.TripleID;
+import org.rdfhdt.hdt.triples.Triples;
+import org.rdfhdt.hdt.triples.impl.BitmapTriples;
+import org.rdfhdt.hdt.triples.impl.OneReadTempTriples;
+import org.rdfhdt.hdt.triples.impl.WriteBitmapTriples;
+import org.rdfhdt.hdt.util.io.CloseSuppressPath;
+import org.rdfhdt.hdt.util.io.IOUtil;
+import org.rdfhdt.hdt.util.listener.IntermediateListener;
+import org.rdfhdt.hdt.util.listener.ListenerUtil;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.ListIterator;
+
+/**
+ * Implementation of the k-HDTCat algorithm
+ *
+ * @author Antoine Willerval
+ */
+public class KCatImpl implements Closeable {
+ private static TripleComponentOrder getOrder(HDT hdt) {
+ Triples triples = hdt.getTriples();
+ if (!(triples instanceof BitmapTriples)) {
+ throw new IllegalArgumentException("HDT Triples must be BitmapTriples");
+ }
+
+ BitmapTriples bt = (BitmapTriples) triples;
+
+ return bt.getOrder();
+ }
+
+ private final String baseURI;
+ final HDT[] hdts;
+ private final CloseSuppressPath location;
+ private final Path futureLocation;
+ private final boolean futureMap;
+ private final boolean clearLocation;
+ private final ProgressListener listener;
+ private final String dictionaryType;
+ private final int bufferSize;
+ private final HDTOptions hdtFormat;
+ private final TripleComponentOrder order;
+ private final long rawSize;
+
+ /**
+ * Create implementation
+ *
+ * @param hdtFileNames the hdt files to cat
+ * @param hdtFormat the options to configure the cat
+ * @param listener listener to get information from the cat
+ * @throws IOException io exception during loading
+ */
+ public KCatImpl(List<String> hdtFileNames, HDTOptions hdtFormat, ProgressListener listener) throws IOException {
+ this.listener = ListenerUtil.multiThreadListener(listener);
+
+ hdts = new HDT[hdtFileNames.size()];
+ this.hdtFormat = hdtFormat;
+
+ long bufferSizeLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_BUFFER_SIZE_KEY, CloseSuppressPath.BUFFER_SIZE);
+ if (bufferSizeLong > Integer.MAX_VALUE - 5L || bufferSizeLong <= 0) {
+ throw new IllegalArgumentException("Buffer size can't be negative or bigger than the size of an array!");
+ } else {
+ bufferSize = (int) bufferSizeLong;
+ }
+
+ try {
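+ // load or map every input HDT, checking that they all share the first HDT's dictionary type and triple order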
+			ListIterator<String> it = hdtFileNames.listIterator();
+
+			int firstIndex = it.nextIndex();
+			String firstHDTFile = it.next();
+
+			HDT firstHDT = HDTManagerImpl.loadOrMapHDT(firstHDTFile, listener, hdtFormat);
+			hdts[firstIndex] = firstHDT;
+
+			dictionaryType = firstHDT.getDictionary().getType();
+			baseURI = firstHDT.getBaseURI();
+			order = getOrder(firstHDT);
+
+			long rawSize = HDTBase.getRawSize(firstHDT.getHeader());
+
+
+			IntermediateListener iListener = new IntermediateListener(listener);
+			iListener.setRange(0, 10);
+			// map all the HDTs
+			while (it.hasNext()) {
+				int index = it.nextIndex();
+				String hdtFile = it.next();
+
+				iListener.notifyProgress(index * 100f / hdtFileNames.size(), "map hdt (" + (index + 1) + "/" + hdtFileNames.size() + ")");
+
+				HDT hdt = HDTManagerImpl.loadOrMapHDT(hdtFile, listener, hdtFormat);
+
+				// sum the raw sizes of all the HDTs, mark the size as unavailable (-1) if one of them doesn't define it
+				long hdtRawSize = HDTBase.getRawSize(hdt.getHeader());
+				rawSize = rawSize == -1 || hdtRawSize == -1 ? -1 : rawSize + hdtRawSize;
+
+				hdts[index] = hdt;
+
+				// check that all the HDTs have the same dictionary type
+				if (!dictionaryType.equals(hdt.getDictionary().getType())) {
+					throw new IllegalArgumentException("Trying to cat hdt with different type, type(hdt0) [" + dictionaryType + "] != type(hdt" + index + ") [" + hdt.getDictionary().getType() + "]");
+				}
+				TripleComponentOrder order = getOrder(hdt);
+
+				if (!order.equals(this.order)) {
+					throw new IllegalArgumentException("Trying to cat hdt with different order, order(hdt0) [" + this.order + "] != order(hdt" + index + ") [" + order + "]");
+				}
+			}
+
+			this.rawSize = rawSize;
+
+			String hdtcatLocationOpt = hdtFormat.get(HDTOptionsKeys.HDTCAT_LOCATION);
+			if (hdtcatLocationOpt == null || hdtcatLocationOpt.isEmpty()) {
+				location = CloseSuppressPath.of(Files.createTempDirectory("hdtCat"));
+				clearLocation = true; // delete temp directory
+			} else {
+				location = CloseSuppressPath.of(hdtcatLocationOpt);
+				Files.createDirectories(location);
+				clearLocation = hdtFormat.getBoolean(HDTOptionsKeys.HDTCAT_DELETE_LOCATION, true);
+			}
+
+			String hdtcatFutureLocationOpt = hdtFormat.get(HDTOptionsKeys.HDTCAT_FUTURE_LOCATION);
+			if (hdtcatFutureLocationOpt == null || hdtcatFutureLocationOpt.isEmpty()) {
+				futureLocation = location.resolve("gen.hdt");
+				futureMap = false;
+			} else {
+				futureLocation = Path.of(hdtcatFutureLocationOpt);
+				futureMap = true;
+			}
+
+			location.closeWithDeleteRecurse();
+		} catch (Throwable t) {
+			for (HDT hdt : hdts) {
+				IOUtil.closeQuietly(hdt);
+			}
+			throw t;
+		}
+	}
+
+	/**
+	 * @return a merger from the config
+	 * @throws IOException io exception
+	 */
+	KCatMerger createMerger(ProgressListener listener) throws IOException {
+		return new KCatMerger(hdts, location, listener, bufferSize, dictionaryType, hdtFormat);
+	}
+
+	/**
+	 * @return a cat from the config HDTs
+	 * @throws IOException io exception
+	 */
+	public HDT cat() throws IOException {
+		IntermediateListener il = new IntermediateListener(listener);
+		String futureLocationStr = futureLocation.toAbsolutePath().toString();
+		il.setRange(0, 40);
+		il.setPrefix("Merge Dict: ");
+		try (KCatMerger merger = createMerger(il)) {
+			// create the dictionary
+			try (DictionaryPrivate dictionary = merger.buildDictionary()) {
+				assert merger.assertReadCorrectly();
+				// create a GROUP BY subject iterator to get the new ordered stream
+				Iterator<TripleID> tripleIterator = GroupBySubjectMapIterator.fromHDTs(merger, hdts);
+				try (WriteBitmapTriples triples = new WriteBitmapTriples(hdtFormat, location.resolve("triples"), bufferSize)) {
+					long count = Arrays.stream(hdts).mapToLong(h -> h.getTriples().getNumberOfElements()).sum();
+
+					il.setRange(40, 80);
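+					// merge the triples of all the HDTs, remapped to the new dictionary ids and grouped by subject +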
il.setPrefix("Merge triples: "); + triples.load(new OneReadTempTriples(tripleIterator, order, count), il); + + WriteHDTImpl writeHDT = new WriteHDTImpl(hdtFormat, location, dictionary, triples, HeaderFactory.createHeader(hdtFormat)); + writeHDT.populateHeaderStructure(baseURI); + // add a raw size from the previous values (if available) + if (rawSize != -1) { + writeHDT.getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, String.valueOf(rawSize)); + } + + il.setRange(80, 90); + il.setPrefix("Save HDT: "); + writeHDT.saveToHDT(futureLocationStr, il); + } + } + + } catch (InterruptedException e) { + throw new IOException("Interruption", e); + } + + il.setRange(90, 100); + HDT hdt; + if (futureMap) { + hdt = HDTManager.mapHDT(futureLocationStr, il); + } else { + hdt = HDTManager.loadHDT(futureLocationStr, il); + Files.deleteIfExists(futureLocation); + } + il.notifyProgress(100, "cat done."); + return hdt; + } + + @Override + public void close() throws IOException { + try { + IOUtil.closeAll(hdts); + } finally { + if (clearLocation) { + location.close(); + } + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMerger.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMerger.java new file mode 100644 index 00000000..d59983fd --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMerger.java @@ -0,0 +1,861 @@ +package org.rdfhdt.hdt.dictionary.impl.kcat; + +import org.rdfhdt.hdt.compact.sequence.SequenceLog64BigDisk; +import org.rdfhdt.hdt.dictionary.DictionaryFactory; +import org.rdfhdt.hdt.dictionary.DictionaryKCat; +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.impl.section.OneReadDictionarySection; +import org.rdfhdt.hdt.dictionary.impl.section.WriteDictionarySection; +import org.rdfhdt.hdt.hdt.HDT; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MapIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.concurrent.SyncSeq; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.Closer; +import org.rdfhdt.hdt.util.string.ByteString; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +/** + * Class to merge multiple dictionaries into S/O/SH streams with map writing + * + * @author Antoine Willerval + */ +public class KCatMerger implements AutoCloseable { + private static final long SHARED_MASK = 0b01; + private static final long TYPED_MASK = 0b10; + + final HDT[] hdts; + + private final ProgressListener listener; + private final CloseSuppressPath[] locations; + final SyncSeq[] subjectsMaps; + final SyncSeq[] predicatesMaps; + final SyncSeq[] objectsMaps; + private final ExceptionThread catMergerThread; + final boolean typedHDT; 
+ private final int shift; + private final String dictionaryType; + + private final PipedCopyIterator subjectPipe = new PipedCopyIterator<>(); + private final PipedCopyIterator objectPipe = new PipedCopyIterator<>(); + private final PipedCopyIterator sharedPipe = new PipedCopyIterator<>(); + private final DuplicateBufferIterator sortedSubject; + private final DuplicateBufferIterator sortedObject; + private final DuplicateBufferIterator sortedPredicates; + private final Map> sortedSubSections; + + private final long estimatedSizeP; + private final AtomicLong countTyped = new AtomicLong(); + private final AtomicLong countShared = new AtomicLong(); + final AtomicLong[] countSubject; + final AtomicLong[] countObject; + + private final WriteDictionarySection sectionSubject; + private final WriteDictionarySection sectionShared; + private final WriteDictionarySection sectionObject; + private final WriteDictionarySection sectionPredicate; + private final Map sectionSub; + private final Map typeId = new HashMap<>(); + private boolean running; + + /** + * Create KCatMerger + * + * @param hdts the hdts to cat + * @param location working location + * @param listener listener to log the state + * @param bufferSize buffer size + * @param dictionaryType dictionary type + * @param spec spec to config the HDT + * @throws java.io.IOException io exception + */ + public KCatMerger(HDT[] hdts, CloseSuppressPath location, ProgressListener listener, int bufferSize, String dictionaryType, HDTOptions spec) throws IOException { + this.hdts = hdts; + this.listener = listener; + this.dictionaryType = dictionaryType; + + DictionaryKCat[] cats = new DictionaryKCat[hdts.length]; + subjectsMaps = new SyncSeq[hdts.length]; + predicatesMaps = new SyncSeq[hdts.length]; + objectsMaps = new SyncSeq[hdts.length]; + locations = new CloseSuppressPath[hdts.length * 3]; + + countSubject = IntStream.range(0, hdts.length).mapToObj(i -> new AtomicLong()).toArray(AtomicLong[]::new); + countObject = IntStream.range(0, hdts.length).mapToObj(i -> new AtomicLong()).toArray(AtomicLong[]::new); + + long sizeS = 0; + long sizeP = 0; + long sizeO = 0; + long sizeShared = 0; + + Map subSections = new TreeMap<>(); + + for (int i = 0; i < cats.length; i++) { + DictionaryKCat cat = DictionaryFactory.createDictionaryKCat(hdts[i].getDictionary()); + + // compute max allocated sizes + sizeS += cat.countSubjects(); + sizeP += cat.countPredicates(); + sizeO += cat.countObjects(); + sizeShared += cat.countShared(); + + long start = 1L + cat.countShared(); + // compute allocated sizes for HDT with sub sections + for (Map.Entry e : cat.getSubSections().entrySet()) { + CharSequence key = e.getKey(); + DictionarySection section = e.getValue(); + + PreIndexSection[] sections = subSections.computeIfAbsent( + ByteString.of(key), + k -> new PreIndexSection[cats.length] + ); + sections[i] = new PreIndexSection(start, section); + start += section.getNumberOfElements(); + } + cats[i] = cat; + } + // if this HDT is typed, we don't have to allocate 1 bit / node to note a typed node + this.typedHDT = !subSections.isEmpty(); + if (typedHDT) { + shift = 2; + } else { + shift = 1; + } + + this.estimatedSizeP = sizeP; + try { + // create maps, allocate more bits for the shared part + int numbitsS = BitUtil.log2(sizeS + 1 + sizeShared) + 1 + shift; + int numbitsP = BitUtil.log2(sizeP + 1); + int numbitsO = BitUtil.log2(sizeO + 1 + sizeShared) + 1 + shift; + for (int i = 0; i < cats.length; i++) { + DictionaryKCat cat = cats[i]; + subjectsMaps[i] = new SyncSeq(new 
SequenceLog64BigDisk((locations[i * 3] = location.resolve("subjectsMap_" + i)).toAbsolutePath().toString(), numbitsS, cat.countSubjects() + 1)); + predicatesMaps[i] = new SyncSeq(new SequenceLog64BigDisk((locations[i * 3 + 1] = location.resolve("predicatesMap_" + i)).toAbsolutePath().toString(), numbitsP, cat.countPredicates() + 1)); + objectsMaps[i] = new SyncSeq(new SequenceLog64BigDisk((locations[i * 3 + 2] = location.resolve("objectsMap_" + i)).toAbsolutePath().toString(), numbitsO, cat.countObjects() + 1)); + } + + // merge the subjects/objects/shared from all the HDTs + sortedSubject = mergeSection( + cats, + (hdtIndex, c) -> createMergeIt( + hdtIndex, + c.getSubjectSection().getSortedEntries(), + c.getSharedSection().getSortedEntries(), + c.countShared() + ) + ); + sortedObject = mergeSection( + cats, + (hdtIndex, c) -> createMergeIt( + hdtIndex, + c.getObjectSection().getSortedEntries(), + c.getSharedSection().getSortedEntries(), + c.objectShift() + ) + ); + + // merge the other sections + sortedPredicates = mergeSection(cats, (hdtIndex, c) -> { + ExceptionIterator of = ExceptionIterator.of(c.getPredicateSection().getSortedEntries()); + return of.map(((element, index) -> new LocatedIndexedNode(hdtIndex, index + 1, ByteString.of(element)))); + }); + + sortedSubSections = new TreeMap<>(); + // create a merge section for each section + subSections.forEach((key, sections) -> sortedSubSections.put(key, mergeSection(sections, (hdtIndex, pre) -> { + ExceptionIterator of = ExceptionIterator.of(pre.getSortedEntries()); + return of.map(((element, index) -> new LocatedIndexedNode(hdtIndex, pre.getStart() + index, ByteString.of(element)))); + }))); + + // convert the dupe buffer streams to byte string streams + + Iterator subject = subjectPipe.mapWithId((db, id) -> { + long header = withEmptyHeader(id + 1); + db.stream().forEach(node -> { + SyncSeq map = subjectsMaps[node.getHdt()]; + assert map.get(node.getIndex()) == 0 : "overwriting previous subject value"; + map.set(node.getIndex(), header); + countSubject[node.getHdt()].incrementAndGet(); + }); + return db.peek(); + }); + + Iterator object = objectPipe.mapWithId((db, id) -> { + long header = withEmptyHeader(id + 1); + db.stream().forEach(node -> { + SyncSeq map = objectsMaps[node.getHdt()]; + assert map.get(node.getIndex()) == 0 : "overwriting previous object value"; + assert node.getIndex() >= 1 && node.getIndex() <= hdts[node.getHdt()].getDictionary().getNobjects(); + map.set(node.getIndex(), header); + countObject[node.getHdt()].incrementAndGet(); + }); + return db.peek(); + }); + + // left = subjects + // right = objects + Iterator shared = sharedPipe.mapWithId((bdb, id) -> { + long header = withSharedHeader(id + 1); + countShared.incrementAndGet(); + // left = subjects + bdb.getLeft().stream().forEach(node -> { + SyncSeq map = subjectsMaps[node.getHdt()]; + assert map.get(node.getIndex()) == 0 : "overwriting previous subject value"; + map.set(node.getIndex(), header); + countSubject[node.getHdt()].incrementAndGet(); + }); + // right = objects + bdb.getRight().stream().forEach(node -> { + SyncSeq map = objectsMaps[node.getHdt()]; + assert map.get(node.getIndex()) == 0 : "overwriting previous object value"; + assert node.getIndex() >= 1 && node.getIndex() <= hdts[node.getHdt()].getDictionary().getNobjects(); + map.set(node.getIndex(), header); + countObject[node.getHdt()].incrementAndGet(); + }); + return bdb.peek(); + }); + + sectionSubject = new WriteDictionarySection(spec, location.resolve("sortedSubject"), bufferSize); + 
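// the shared, object, predicate and typed sections below are created the same way, each backed by its own temporary file +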
sectionShared = new WriteDictionarySection(spec, location.resolve("sortedShared"), bufferSize); + sectionObject = new WriteDictionarySection(spec, location.resolve("sortedObject"), bufferSize); + sectionPredicate = new WriteDictionarySection(spec, location.resolve("sortedPredicate"), bufferSize); + sectionSub = new TreeMap<>(); + sortedSubSections.keySet().forEach((key) -> sectionSub.put(key, new WriteDictionarySection(spec, location.resolve("sortedSub" + getTypeId(key)), bufferSize))); + + catMergerThread = new ExceptionThread(this::runSharedCompute, "KCatMergerThreadShared") + .attach(new ExceptionThread(this::runSubSectionCompute, "KCatMergerThreadSubSection")) + .attach(new ExceptionThread(createWriter(sectionSubject, sizeS, subject), "KCatMergerThreadWriterS")) + .attach(new ExceptionThread(createWriter(sectionShared, sizeS + sizeO - sizeShared, shared), "KCatMergerThreadWriterSH")) + .attach(new ExceptionThread(createWriter(sectionObject, sizeO, object), "KCatMergerThreadWriterO")); + } catch (Throwable t) { + try { + throw t; + } finally { + close(); + } + } + } + + private static ExceptionIterator createMergeIt(int hdtIndex, Iterator subjectObject, Iterator shared, long sharedCount) { + return MergeExceptionIterator.buildOfTree(List.of( + MapIterator.of(subjectObject, (element, index) -> + new LocatedIndexedNode(hdtIndex, sharedCount + index + 1, ByteString.of(element))) + .asExceptionIterator(), + MapIterator.of(shared, (element, index) -> + new LocatedIndexedNode(hdtIndex, index + 1, ByteString.of(element))) + .asExceptionIterator() + )); + } + + /** + * create a sorted LocatedIndexedNode iterator from an array of sections + * + * @param sections the sections + * @return iterator + */ + public static DuplicateBufferIterator mergeSection(T[] sections, MergerFunction mapper) { + return new DuplicateBufferIterator<>( + MergeExceptionIterator + .buildOfTree( + (Integer hdtIndex, T e) -> { + if (e == null) { + // empty section (not defined for this HDT) + return ExceptionIterator.empty(); + } + // convert all the entries into located nodes + return mapper.apply(hdtIndex, e); + }, + LocatedIndexedNode::compareTo, + List.of(sections), + 0, + sections.length + ), + sections.length + ); + } + + /** + * get an UID for a type + * + * @param str the type + * @return UID + */ + public int getTypeId(ByteString str) { + return typeId.computeIfAbsent(str, (key) -> typeId.size()); + } + + /** + * add a typed header to this value + * + * @param value value + * @return header value + * @see #withEmptyHeader(long) + * @see #withSharedHeader(long) + */ + public long withTypedHeader(long value) { + assert value != 0 : "value can't be 0!"; + return (value << shift) | TYPED_MASK; + } + + /** + * add a shared header to this value + * + * @param value value + * @return header value + * @see #withEmptyHeader(long) + * @see #withTypedHeader(long) + */ + public long withSharedHeader(long value) { + assert value != 0 : "value can't be 0!"; + return (value << shift) | SHARED_MASK; + } + + /** + * add a header to this value + * + * @param value value + * @return header value + * @see #withTypedHeader(long) + * @see #withSharedHeader(long) + */ + public long withEmptyHeader(long value) { + assert value != 0 : "value can't be 0!"; + return value << shift; + } + + boolean assertReadCorrectly() { + for (int i = 0; i < hdts.length; i++) { + HDT hdt = hdts[i]; + assert countObject[i].get() == hdt.getDictionary().getNobjects(); + assert countSubject[i].get() == hdt.getDictionary().getNsubjects(); + } + return true; + 
} + + /** + * test if a header value is shared + * + * @param headerValue header value + * @return true if the header is shared, false otherwise + */ + public boolean isShared(long headerValue) { + return (headerValue & SHARED_MASK) != 0; + } + + /** + * test if a header value is typed + * + * @param headerValue header value + * @return true if the header is typed, false otherwise + */ + public boolean isTyped(long headerValue) { + return typedHDT && (headerValue & TYPED_MASK) != 0; + } + + /** + * wait for the merger to complete + * + * @throws InterruptedException thread interruption + */ + public DictionaryPrivate buildDictionary() throws InterruptedException { + synchronized (this) { + if (!running) { + startMerger(); + } + } + catMergerThread.joinAndCrashIfRequired(); + + return DictionaryFactory.createWriteDictionary( + dictionaryType, + null, + getSectionSubject(), + getSectionPredicate(), + getSectionObject(), + getSectionShared(), + getSectionSub() + ); + } + + private void runSharedCompute() { + // merge the sections + try { + sharedLoop: + while (sortedObject.hasNext() && sortedSubject.hasNext()) { + // last was a shared node + DuplicateBuffer newSubject = sortedSubject.next(); + DuplicateBuffer newObject = sortedObject.next(); + int comp = newSubject.compareTo(newObject); + while (comp != 0) { + if (comp < 0) { + subjectPipe.addElement(newSubject.trim()); + if (!sortedSubject.hasNext()) { + // no more subjects, send the current object and break the shared loop + objectPipe.addElement(newObject.trim()); + break sharedLoop; + } + newSubject = sortedSubject.next(); + } else { + objectPipe.addElement(newObject.trim()); + if (!sortedObject.hasNext()) { + // no more objects, send the current subject and break the shared loop + subjectPipe.addElement(newSubject.trim()); + break sharedLoop; + } + newObject = sortedObject.next(); + } + comp = newSubject.compareTo(newObject); + } + + // shared element + sharedPipe.addElement(newSubject.trim().asBi(newObject.trim())); + } + + // at least one iterator is empty, closing the shared pipe + sharedPipe.closePipe(); + // do we have subjects? + while (sortedSubject.hasNext()) { + subjectPipe.addElement(sortedSubject.next().trim()); + } + subjectPipe.closePipe(); + // do we have objects? 
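+ // (at most one of the two drain loops can emit elements: the shared loop above only ends when one side is exhausted)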
+ while (sortedObject.hasNext()) { + objectPipe.addElement(sortedObject.next().trim()); + } + objectPipe.closePipe(); + } catch (Throwable t) { + objectPipe.closePipe(t); + subjectPipe.closePipe(t); + sharedPipe.closePipe(t); + throw t; + } + } + + private void runSubSectionCompute() { + // convert all the sections + + // load predicates + sectionPredicate.load(new OneReadDictionarySection(sortedPredicates.map((db, id) -> { + db.stream().forEach(node -> { + SyncSeq map = predicatesMaps[node.getHdt()]; + assert map.get(node.getIndex()) == 0 : "overwriting previous predicate value"; + map.set(node.getIndex(), id + 1); + }); + return db.peek(); + }).asIterator(), estimatedSizeP), null); + + long shift = 1L; + // load data typed sections + for (Map.Entry e : sectionSub.entrySet()) { + ByteString key = e.getKey(); + WriteDictionarySection section = e.getValue(); + + DuplicateBufferIterator bufferIterator = sortedSubSections.get(key); + + final long currentShift = shift; + section.load(new OneReadDictionarySection(bufferIterator.map((db, id) -> { + long headerID = withTypedHeader(id + currentShift); + countTyped.incrementAndGet(); + db.stream().forEach(node -> { + SyncSeq map = objectsMaps[node.getHdt()]; + assert map.get(node.getIndex()) == 0 : "overwriting previous object value"; + assert node.getIndex() >= 1 && node.getIndex() <= hdts[node.getHdt()].getDictionary().getNobjects(); + map.set(node.getIndex(), headerID); + countObject[node.getHdt()].incrementAndGet(); + }); + return db.peek(); + }).asIterator(), estimatedSizeP), null); + shift += section.getNumberOfElements(); + } + } + + private ExceptionThread.ExceptionRunnable createWriter(DictionarySectionPrivate sect, long size, Iterator iterator) { + // convert all the sections + return () -> sect.load(new OneReadDictionarySection(iterator, size), listener); + } + + @Override + public void close() throws IOException { + try { + if (catMergerThread != null) { + catMergerThread.joinAndCrashIfRequired(); + } + } catch (InterruptedException e) { + throw new RuntimeException(e); + } finally { + Closer.of(sectionSubject, sectionPredicate, sectionObject, sectionShared) + .with(sectionSub.values()) + .with(subjectsMaps) + .with(predicatesMaps) + .with(objectsMaps) + .with(locations) + .close(); + } + } + + /** + * remove the header of a header id + * + * @param headerID header id + * @return id + */ + public long removeHeader(long headerID) { + return headerID >>> shift; + } + + /** + * extract the subject from an HDT + * + * @param hdtIndex the HDT index + * @param oldID the ID in the HDT triples + * @return ID in the new HDT + */ + public long extractSubject(int hdtIndex, long oldID) { + long headerID = subjectsMaps[hdtIndex].get(oldID); + if (isShared(headerID)) { + return headerID >>> shift; + } + return (headerID >>> shift) + countShared.get(); + } + + /** + * extract the predicate from an HDT + * + * @param hdtIndex the HDT index + * @param oldID the ID in the HDT triples + * @return ID in the new HDT + */ + public long extractPredicate(int hdtIndex, long oldID) { + return predicatesMaps[hdtIndex].get(oldID); + } + + /** + * extract the object from an HDT + * + * @param hdtIndex the HDT index + * @param oldID the ID in the HDT triples + * @return ID in the new HDT + */ + public long extractObject(int hdtIndex, long oldID) { + long headerID = objectsMaps[hdtIndex].get(oldID); + if (isShared(headerID)) { + return headerID >>> shift; + } + if (isTyped(headerID)) { + return (headerID >>> shift) + countShared.get(); + } + return (headerID >>> 
shift) + countShared.get() + countTyped.get(); + } + + /** + * copy into a new TripleID the mapped version of a tripleID + * + * @param hdtIndex the origin HDT of this tripleID + * @param id the tripleID + * @return mapped tripleID + */ + public TripleID extractMapped(int hdtIndex, TripleID id) { + TripleID mapped = new TripleID( + extractSubject(hdtIndex, id.getSubject()), + extractPredicate(hdtIndex, id.getPredicate()), + extractObject(hdtIndex, id.getObject()) + ); + assert mapped.isValid() : "mapped to empty triples! " + id + " => " + mapped; + return mapped; + } + + /** + * @return the count of shared elements + */ + public long getCountShared() { + return countShared.get(); + } + + /** + * @return subject section + */ + public DictionarySectionPrivate getSectionSubject() { + return sectionSubject; + } + + /** + * @return shared section + */ + public DictionarySectionPrivate getSectionShared() { + return sectionShared; + } + + /** + * @return object section + */ + public DictionarySectionPrivate getSectionObject() { + return sectionObject; + } + + /** + * @return predicate section + */ + public DictionarySectionPrivate getSectionPredicate() { + return sectionPredicate; + } + + /** + * @return sub sections + */ + public TreeMap getSectionSub() { + TreeMap sub = new TreeMap<>(sectionSub); + sub.put(LiteralsUtils.NO_DATATYPE, getSectionObject()); + return sub; + } + + /** + * start the merger threads + */ + public synchronized void startMerger() { + if (running) { + throw new IllegalArgumentException("KCatMerger is already running!"); + } + running = true; + + catMergerThread.startAll(); + } + + static class BiDuplicateBuffer implements Comparable { + private final DuplicateBuffer left; + private final DuplicateBuffer right; + + + public BiDuplicateBuffer(DuplicateBuffer left, DuplicateBuffer right) { + this.left = Objects.requireNonNull(left, "left buffer can't be null!"); + this.right = Objects.requireNonNull(right, "right buffer can't be null!"); + assert left.isEmpty() || right.isEmpty() || left.peek().equals(right.peek()) : "Can't have heterogeneous bi dupe buffer"; + } + + public DuplicateBuffer getLeft() { + return left; + } + + public DuplicateBuffer getRight() { + return right; + } + + public boolean isEmpty() { + return getLeft().isEmpty() && getRight().isEmpty(); + } + + public ByteString peek() { + if (!left.isEmpty()) { + return left.peek(); + } + if (!right.isEmpty()) { + return right.peek(); + } + return null; + } + + @Override + public int compareTo(BiDuplicateBuffer o) { + return peek().compareTo(o.peek()); + } + } + + static class DuplicateBuffer implements Comparable { + private final LocatedIndexedNode[] buffer; + private int used; + + public DuplicateBuffer(int bufferSize) { + this.buffer = new LocatedIndexedNode[bufferSize]; + } + + /** + * add a node to this buffer + * + * @param node node + * @return if this node was added to the buffer + */ + private boolean add(LocatedIndexedNode node) { + // start case + if (isEmpty() || buffer[0].getNode().equals(node.getNode())) { + // we can't have more than buffer size because a source HDT wouldn't be + // without duplicated or a so/sh conflict + buffer[used++] = node; + return true; + } + + return false; + } + + /** + * convert this buffer to a bi duplicate buffer + * + * @param other right part + * @return BiDuplicateBuffer + */ + public BiDuplicateBuffer asBi(DuplicateBuffer other) { + return new BiDuplicateBuffer(this, other); + } + + /** + * @return if this buffer contains at least one element + */ + public boolean 
isEmpty() { + return used == 0; + } + + /** + * clear the buffer + */ + public void clear() { + // clear old values + for (int i = 0; i < used; i++) { + buffer[i] = null; + } + used = 0; + } + + /** + * @return a stream of the current duplicate objects + */ + public Stream stream() { + return Arrays.stream(buffer, 0, used); + } + + @Override + public int compareTo(DuplicateBuffer o) { + if (isEmpty() || o.isEmpty()) { + throw new IllegalArgumentException("Can't compare empty buffers"); + } + return buffer[0].compareTo(o.buffer[0]); + } + + /** + * @return a trimmed version of this buffer + */ + public DuplicateBuffer trim() { + DuplicateBuffer other = new DuplicateBuffer(used); + System.arraycopy(buffer, 0, other.buffer, 0, used); + other.used = used; + return other; + } + + /** + * @return the buffered byte string, null if empty + */ + public ByteString peek() { + if (isEmpty()) { + return null; + } + return buffer[0].getNode(); + } + + /** + * @return the size of the buffer + */ + public int size() { + return used; + } + } + + static class DuplicateBufferIterator implements ExceptionIterator { + private final ExceptionIterator iterator; + private final DuplicateBuffer buffer; + private LocatedIndexedNode last; + private DuplicateBuffer next; + + public DuplicateBufferIterator(ExceptionIterator iterator, int bufferSize) { + this.iterator = iterator; + buffer = new DuplicateBuffer(bufferSize); + } + + @Override + public boolean hasNext() throws E { + if (next != null) { + return true; + } + + // clear previous buffer + buffer.clear(); + while (true) { + // load an element from the iterator + if (last == null) { + if (!iterator.hasNext()) { + if (buffer.isEmpty()) { + return false; + } + break; + } + last = iterator.next(); + } + + // add the elements from the iterator + if (!buffer.add(last)) { + break; + } + last = null; + } + + next = buffer.trim(); + return true; + } + + @Override + public DuplicateBuffer next() throws E { + if (!hasNext()) { + return null; + } + try { + return next; + } finally { + next = null; + } + } + + } + + private interface MergerFunction { + ExceptionIterator apply(int hdtIndex, T t); + } + + private static class PreIndexSection { + long start; + DictionarySection section; + + public PreIndexSection(long start, DictionarySection section) { + this.start = start; + this.section = section; + } + + public long getStart() { + return start; + } + + public DictionarySection getSection() { + return section; + } + + public Iterator getSortedEntries() { + return getSection().getSortedEntries(); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/LocatedIndexedNode.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/LocatedIndexedNode.java new file mode 100644 index 00000000..fe43e51f --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/LocatedIndexedNode.java @@ -0,0 +1,35 @@ +package org.rdfhdt.hdt.dictionary.impl.kcat; + +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.string.ByteString; + +public class LocatedIndexedNode extends IndexedNode { + private final int hdt; + + public LocatedIndexedNode(int hdt, long index, ByteString string) { + super(string, index); + this.hdt = hdt; + } + + public int getHdt() { + return hdt; + } + + public int compareTo(LocatedIndexedNode o) { + return super.compareTo(o); + } + + @Override + public LocatedIndexedNode clone() { + return (LocatedIndexedNode) super.clone(); + } + + @Override + public String toString() { + return 
"LocatedIndexedNode{" + + "hdt=" + hdt + + ", index=" + getIndex() + + ", node=" + getNode() + + '}'; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/MultipleSectionDictionaryKCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/MultipleSectionDictionaryKCat.java new file mode 100644 index 00000000..b0afdcc3 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/MultipleSectionDictionaryKCat.java @@ -0,0 +1,80 @@ +package org.rdfhdt.hdt.dictionary.impl.kcat; + +import org.rdfhdt.hdt.dictionary.Dictionary; +import org.rdfhdt.hdt.dictionary.DictionaryKCat; +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.string.ByteString; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; + +import java.util.Map; +import java.util.TreeMap; + +public class MultipleSectionDictionaryKCat implements DictionaryKCat { + private final Dictionary dictionary; + + public MultipleSectionDictionaryKCat(Dictionary dictionary) { + this.dictionary = dictionary; + } + + @Override + public Map getSubSections() { + Map sections = new TreeMap<>(CharSequenceComparator.getInstance()); + dictionary.getAllObjects().forEach((key, section) -> { + if (!LiteralsUtils.NO_DATATYPE.equals(key)) { + // we ignore this section because it will be used in the shared compute + sections.put(ByteString.of(key), section); + } + }); + return sections; + } + + @Override + public DictionarySection getSubjectSection() { + return dictionary.getSubjects(); + } + + @Override + public DictionarySection getPredicateSection() { + return dictionary.getPredicates(); + } + + @Override + public DictionarySection getObjectSection() { + return dictionary.getAllObjects().get("NO_DATATYPE"); + } + + @Override + public DictionarySection getSharedSection() { + return dictionary.getShared(); + } + + @Override + public long countSubjects() { + return dictionary.getSubjects().getNumberOfElements() + countShared(); + } + + @Override + public long countShared() { + return dictionary.getShared().getNumberOfElements(); + } + + @Override + public long countPredicates() { + return dictionary.getPredicates().getNumberOfElements(); + } + + @Override + public long countObjects() { + long count = 0; + for (DictionarySection sec : dictionary.getAllObjects().values()) { + count += sec.getNumberOfElements(); + } + return count + countShared(); + } + + @Override + public long objectShift() { + return countObjects() - getObjectSection().getNumberOfElements(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java index b6e5f06d..0ae957cd 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java @@ -74,16 +74,16 @@ public void load(Iterator it, long count, ProgressListen blocks.append(out.getTotalBytes()); // Copy full string - ByteStringUtil.append(out, str, 0); + ByteStringUtil.append(crcout, str, 0); } else { // Find common part. 
int delta = ByteStringUtil.longestCommonPrefix(previousStr, str); // Write Delta in VByte - VByte.encode(out, delta); + VByte.encode(crcout, delta); // Write remaining - ByteStringUtil.append(out, str, delta); + ByteStringUtil.append(crcout, str, delta); } - out.write(0); + crcout.write(0); previousStr = str; numberElements++; if (currentCount % block == 0) { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index 46f77c67..86eefc5b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -2,6 +2,7 @@ import org.rdfhdt.hdt.compact.bitmap.Bitmap; import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.kcat.KCatImpl; import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.NotFoundException; @@ -56,13 +57,13 @@ public HDTOptions doReadOptions(String file) throws IOException { return new HDTSpecification(file); } - private HDT loadOrMapHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException { + public static HDT loadOrMapHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException { String loadingMethod = spec.get(HDTOptionsKeys.LOAD_HDT_TYPE_KEY); if (loadingMethod == null || loadingMethod.isEmpty() || HDTOptionsKeys.LOAD_HDT_TYPE_VALUE_MAP.equals(loadingMethod)) { - return doMapHDT(hdtFileName, listener, spec); + return mapHDT(hdtFileName, listener, spec); } if (HDTOptionsKeys.LOAD_HDT_TYPE_VALUE_LOAD.equals(loadingMethod)) { - return doLoadHDT(hdtFileName, listener, spec); + return loadHDT(hdtFileName, listener, spec); } throw new IllegalArgumentException("Bad loading method: " + loadingMethod); } @@ -479,6 +480,20 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< } } + @Override + protected HDT doHDTCat(List hdtFileNames, HDTOptions hdtFormat, ProgressListener listener) throws IOException { + if (hdtFileNames.isEmpty()) { + return HDTFactory.createHDT(hdtFormat); + } + if (hdtFileNames.size() == 1) { + return loadOrMapHDT(hdtFileNames.get(0), listener, hdtFormat); + } + + try (KCatImpl kCat = new KCatImpl(hdtFileNames, hdtFormat, listener)) { + return kCat.cat(); + } + } + private static class HDTFile { private final Path hdtFile; private final long chunks; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java index fe1baeab..2c2befe0 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java @@ -1,6 +1,7 @@ package org.rdfhdt.hdt.hdt.impl; import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.hdt.HDTPrivate; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.header.Header; @@ -9,6 +10,7 @@ import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.triples.IteratorTripleString; import org.rdfhdt.hdt.triples.TriplesPrivate; import org.rdfhdt.hdt.util.StringUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -91,6 +93,10 @@ public long size() { return dictionary.size() + triples.size(); } + public HDTOptions getSpec() { + 
return spec; + } + /* * (non-Javadoc) * @@ -119,6 +125,22 @@ public void saveToHDT(OutputStream output, ProgressListener listener) throws IOE triples.save(output, ci, iListener); } + public static long getRawSize(Header header) { + + try { + IteratorTripleString rawSize1 = header.search("_:statistics", HDTVocabulary.ORIGINAL_SIZE, ""); + if (!rawSize1.hasNext()) { + return -1; + } + + CharSequence obj = rawSize1.next().getObject(); + // remove "s in "" + return Long.parseLong(obj, 1, obj.length() - 1, 10); + } catch (NotFoundException e) { + return -1; + } + } + @Override public void populateHeaderStructure(String baseUri) { if (baseUri == null || baseUri.length() == 0) { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java index d2f1831a..d0bbc8eb 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java @@ -557,22 +557,6 @@ public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener, profiler.popSection(); } - public static long getRawSize(Header header) { - - try { - IteratorTripleString rawSize1 = header.search("_:statistics", HDTVocabulary.ORIGINAL_SIZE, ""); - if (!rawSize1.hasNext()) { - return -1; - } - - CharSequence obj = rawSize1.next().getObject(); - // remove "s in "" - return Long.parseLong(obj, 1, obj.length() - 1, 10); - } catch (NotFoundException e) { - return -1; - } - } - public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener listener, Profiler profiler) throws IOException { if (listener != null) { listener.notifyProgress(0, "Generating dictionary"); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java index 76d6599c..a912def6 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java @@ -2,16 +2,13 @@ import org.rdfhdt.hdt.dictionary.DictionaryFactory; import org.rdfhdt.hdt.dictionary.DictionaryPrivate; -import org.rdfhdt.hdt.dictionary.impl.WriteFourSectionDictionary; -import org.rdfhdt.hdt.dictionary.impl.WriteMultipleSectionDictionary; import org.rdfhdt.hdt.exceptions.NotImplementedException; -import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.header.HeaderFactory; import org.rdfhdt.hdt.header.HeaderPrivate; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; -import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.triples.IteratorTripleString; +import org.rdfhdt.hdt.triples.TriplesPrivate; import org.rdfhdt.hdt.triples.impl.WriteBitmapTriples; import org.rdfhdt.hdt.util.io.CloseSuppressPath; import org.rdfhdt.hdt.util.io.IOUtil; @@ -29,7 +26,7 @@ * * @author Antoine Willerval */ -public class WriteHDTImpl extends HDTBase { +public class WriteHDTImpl extends HDTBase { private String baseURI; private final CloseSuppressPath workingLocation; private boolean isClosed; @@ -45,6 +42,17 @@ public WriteHDTImpl(HDTOptions spec, CloseSuppressPath workingLocation, int buff // small, can use default implementation header = HeaderFactory.createHeader(this.spec); } + public WriteHDTImpl(HDTOptions spec, CloseSuppressPath workingLocation, DictionaryPrivate dict, TriplesPrivate triples, HeaderPrivate header) throws IOException { + super(spec); + this.workingLocation = workingLocation; + 
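// ensure the working directory exists before the parts of the HDT are saved into it +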
workingLocation.mkdirs(); + + dictionary = dict; + // we need to have the bitmaps in memory, so we can't bypass the implementation + this.triples = triples; + // small, can use default implementation + this.header = header; + } @Override public void setBaseUri(String baseURI) { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java index ca933bc9..6c2ac73b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java @@ -5,11 +5,20 @@ /** * Iterator to map a value to another + * * @param origin type * @param return type * @author Antoine Willerval */ public class MapIterator implements Iterator { + public static MapIterator of(Iterator base, Function mappingFunction) { + return new MapIterator<>(base, mappingFunction); + } + + public static MapIterator of(Iterator base, MapWithIdFunction mappingFunction) { + return new MapIterator<>(base, mappingFunction); + } + private final MapWithIdFunction mappingFunction; private final Iterator base; private long index; @@ -17,6 +26,7 @@ public class MapIterator implements Iterator { public MapIterator(Iterator base, Function mappingFunction) { this(base, (m, i) -> mappingFunction.apply(m)); } + public MapIterator(Iterator base, MapWithIdFunction mappingFunction) { this.base = base; this.mappingFunction = mappingFunction; @@ -37,6 +47,13 @@ public void remove() { base.remove(); } + /** + * @return this iterator, but as an exception iterator + */ + public ExceptionIterator asExceptionIterator() { + return ExceptionIterator.of(this); + } + @FunctionalInterface public interface MapWithIdFunction { E apply(T element, long index); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java index a24d9e81..0074c5a3 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java @@ -3,19 +3,21 @@ import java.util.Arrays; import java.util.Comparator; import java.util.List; +import java.util.function.BiFunction; import java.util.function.Function; public class MergeExceptionIterator implements ExceptionIterator { /** * Create a tree of merge iterators from an array of element + * * @param itFunction a function to create an iterator from an element - * @param comp comparator for the merge iterator - * @param array the elements - * @param length the number of elements - * @param input of the element - * @param type of the element in the iterator - * @param exception returned by the iterator + * @param comp comparator for the merge iterator + * @param array the elements + * @param length the number of elements + * @param input of the element + * @param type of the element in the iterator + * @param exception returned by the iterator * @return the iterator */ public static ExceptionIterator buildOfTree( @@ -25,13 +27,14 @@ public static ExceptionIterator buildOfTree( /** * Create a tree of merge iterators from an array of element + * * @param itFunction a function to create an iterator from an element - * @param comp comparator for the merge iterator - * @param array the elements - * @param start the start of the array (inclusive) - * @param end the end of the array (exclusive) - * @param type of the 
element - * @param exception returned by the iterator + * @param comp comparator for the merge iterator + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param type of the element + * @param exception returned by the iterator * @return the iterator */ public static ExceptionIterator buildOfTree( @@ -41,23 +44,83 @@ public static ExceptionIterator buildOfTree( /** * Create a tree of merge iterators from an array of element + * * @param itFunction a function to create an iterator from an element - * @param comp comparator for the merge iterator + * @param comp comparator for the merge iterator + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param type of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( + Function> itFunction, Comparator comp, List array, int start, int end) { + return buildOfTree((index, o) -> itFunction.apply(o), comp, array, start, end); + } + + /** + * Create a tree of merge iterators from an array of element + * + * @param itFunction a function to create an iterator from an element + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param type of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static , E extends Exception> ExceptionIterator buildOfTree( + Function> itFunction, List array, int start, int end) { + return buildOfTree((index, o) -> itFunction.apply(o), Comparable::compareTo, array, start, end); + } + + /** + * Create a tree of merge iterators from an array of element + * * @param array the elements * @param start the start of the array (inclusive) - * @param end the end of the array (exclusive) - * @param type of the element - * @param exception returned by the iterator + * @param end the end of the array (exclusive) + * @param type of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static , E extends Exception> ExceptionIterator buildOfTree(List> array, int start, int end) { + return MergeExceptionIterator.buildOfTree(Function.identity(), Comparable::compareTo, array, start, end); + } + + /** + * Create a tree of merge iterators from an array of element + * + * @param array the elements + * @param type of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static , E extends Exception> ExceptionIterator buildOfTree(List> array) { + return MergeExceptionIterator.buildOfTree(Function.identity(), Comparable::compareTo, array, 0, array.size()); + } + + /** + * Create a tree of merge iterators from an array of element + * + * @param itFunction a function to create an iterator from an element + * @param comp comparator for the merge iterator + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param type of the element + * @param exception returned by the iterator * @return the iterator */ public static ExceptionIterator buildOfTree( - Function> itFunction, Comparator comp, List array, int start, int end) { + BiFunction> itFunction, Comparator comp, List array, int start, int end) { int length = end - start; if (length <= 0) { return ExceptionIterator.empty(); } if (length == 1) { - return 
itFunction.apply(array.get(start)); + return itFunction.apply(start, array.get(start)); } int mid = (start + end) / 2; return new MergeExceptionIterator<>( diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HDTOptionsBase.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HDTOptionsBase.java index 4e28a1ac..d24d720e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HDTOptionsBase.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HDTOptionsBase.java @@ -28,6 +28,7 @@ package org.rdfhdt.hdt.options; import java.util.Properties; +import java.util.Set; /** * @author mario.arias @@ -80,7 +81,10 @@ public void setInt(String key, long value) { properties.setProperty(key, Long.toString(value)); } - + @Override + public Set getKeys() { + return properties.keySet(); + } @Override public void clear() { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HideHDTOptions.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HideHDTOptions.java index dc557c15..5257675a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HideHDTOptions.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/options/HideHDTOptions.java @@ -1,6 +1,8 @@ package org.rdfhdt.hdt.options; +import java.util.HashSet; import java.util.Objects; +import java.util.Set; import java.util.function.Function; /** @@ -21,6 +23,11 @@ public HideHDTOptions(HDTOptions spec, Function mapper) { this.mapper = mapper; } + @Override + public Set getKeys() { + return spec.getKeys(); + } + private String map(String key) { return Objects.requireNonNullElse(mapper.apply(key), ""); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java index a76c6d9d..9a1bb725 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java @@ -2,11 +2,12 @@ import org.rdfhdt.hdt.util.string.ByteString; import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.CompactString; import java.util.Comparator; -public final class IndexedNode implements Comparable { +public class IndexedNode implements Comparable, Cloneable { private ByteString node; private long index; @@ -41,4 +42,16 @@ public void setNode(ByteString node) { public int compareTo(IndexedNode o) { return node.compareTo(o.node); } + + + @Override + public IndexedNode clone() { + try { + IndexedNode clone = (IndexedNode) super.clone(); + clone.node = new CompactString(node); + return clone; + } catch (CloneNotSupportedException e) { + throw new AssertionError(e); + } + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncSeq.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncSeq.java new file mode 100644 index 00000000..46df7bb2 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncSeq.java @@ -0,0 +1,32 @@ +package org.rdfhdt.hdt.util.concurrent; + +import org.rdfhdt.hdt.compact.sequence.DynamicSequence; + +import java.io.Closeable; +import java.io.IOException; + +/** + * Class to synchronize a {@link org.rdfhdt.hdt.compact.sequence.DynamicSequence} map + * + * @author Antoine Willerval + */ +public class SyncSeq implements Closeable { + private final DynamicSequence seq; + + public SyncSeq(DynamicSequence seq) { + this.seq = seq; + } + + public synchronized long get(long index) { + return seq.get(index); + } + + public synchronized void set(long 
index, long value) { + seq.set(index, value); + } + + @Override + public synchronized void close() throws IOException { + seq.close(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/Closer.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/Closer.java new file mode 100644 index 00000000..cfe970c0 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/Closer.java @@ -0,0 +1,63 @@ +package org.rdfhdt.hdt.util.io; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +/** + * Class to close many {@link java.io.Closeable} objects at once without having to do a large try-finally tree + * + * @author Antoine Willerval + */ +public class Closer implements Iterable, Closeable { + private final List list; + + private Closer(Closeable... other) { + list = new ArrayList<>(Arrays.asList(other)); + } + + /** + * create closer with closeables + * @param other closeables + * @return closer + */ + public static Closer of(Closeable... other) { + return new Closer(other); + } + + /** + * add closeables to this closer + * + * @param other closeables + * @return this + */ + public Closer with(Closeable... other) { + return with(List.of(other)); + } + + /** + * add closeables iterable to this closer + * + * @param iterable closeables + * @return this + */ + public Closer with(Iterable iterable) { + list.addAll(StreamSupport.stream(iterable.spliterator(), false).collect(Collectors.toList())); + return this; + } + + @Override + public Iterator iterator() { + return list.iterator(); + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(list); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMergerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMergerTest.java new file mode 100644 index 00000000..d1be2e0e --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMergerTest.java @@ -0,0 +1,318 @@ +package org.rdfhdt.hdt.dictionary.impl.kcat; + +import org.apache.commons.io.file.PathUtils; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.rdfhdt.hdt.compact.sequence.DynamicSequence; +import org.rdfhdt.hdt.dictionary.Dictionary; +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySection; +import org.rdfhdt.hdt.enums.TripleComponentRole; +import org.rdfhdt.hdt.exceptions.NotFoundException; +import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.hdt.HDT; +import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.hdt.HDTManagerTest; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsBase; +import org.rdfhdt.hdt.options.HDTOptionsKeys; +import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier; +import org.rdfhdt.hdt.util.concurrent.SyncSeq; +import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest; +import org.rdfhdt.hdt.util.io.Closer; +import org.rdfhdt.hdt.util.string.ByteString; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.InputStream; 
+import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +@RunWith(Parameterized.class) +public class KCatMergerTest extends AbstractMapMemoryTest { + @Parameterized.Parameters(name = "multi: {0}, unicode: {1}, map: {2}") + public static Collection params() { + return Stream.of(false, true).flatMap( + multi -> Stream.of(false, true).flatMap( + unicode -> Stream.of(false, true).map( + map -> new Object[]{multi, unicode, map} + ) + ) + ).collect(Collectors.toList()); + } + + @Parameterized.Parameter + public boolean multi; + @Parameterized.Parameter(1) + public boolean unicode; + @Parameterized.Parameter(2) + public boolean map; + + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + + private void writeSection(DictionarySection sec, OutputStream stream) throws IOException { + ((DictionarySectionPrivate) sec).save(stream, null); + } + + private DictionarySection loadSection(InputStream stream) throws IOException { + PFCDictionarySection section = new PFCDictionarySection(new HDTOptionsBase()); + section.load(stream, null); + return section; + } + + private Map loadMultiSection(List seq, InputStream stream) throws IOException { + Map sectionMap = new TreeMap<>(); + for (CharSequence key : seq) { + PFCDictionarySection section = new PFCDictionarySection(new HDTOptionsBase()); + section.load(stream, null); + sectionMap.put(ByteString.of(key), section); + } + return sectionMap; + } + + @Test + public void mergerTest() throws ParserException, IOException, InterruptedException { + Path root = tempDir.getRoot().toPath(); + try { + int count = 10; + HDTOptions spec = new HDTOptionsBase(); + + if (multi) { + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS); + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH); + } + + // create "count" fake HDTs + LargeFakeDataSetStreamSupplier s = LargeFakeDataSetStreamSupplier + .createSupplierWithMaxTriples(1000, 42) + .withMaxElementSplit(50) + .withUnicode(unicode); + + List hdts = new ArrayList<>(); + for (int i = 0; i < count; i++) { + String location = root.resolve("hdt" + i + ".hdt").toAbsolutePath().toString(); + hdts.add(location); + s.createAndSaveFakeHDT(spec, location); + } + + // create the excepted HDT from previous algorithm + Path fatcathdt = root.resolve("fatcat.hdt"); + LargeFakeDataSetStreamSupplier + .createSupplierWithMaxTriples(1000 * count, 42) + .withMaxElementSplit(50) + .withUnicode(unicode) + .createAndSaveFakeHDT(spec, fatcathdt.toAbsolutePath().toString()); + + // create dictionary and write sections + Path dictFile = root.resolve("dict"); + List sub = new ArrayList<>(); + try (KCatImpl impl = new KCatImpl(hdts, spec, null)) { + try (KCatMerger merger = impl.createMerger(null)) { + assertEquals(multi, merger.typedHDT); + merger.startMerger(); + // create + DictionaryPrivate dict = merger.buildDictionary(); + try (OutputStream stream = new BufferedOutputStream(Files.newOutputStream(dictFile))) { + 
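// write the sections in a fixed order (shared, subjects, predicates, then the object sections) so they can be reloaded in the same order below +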
writeSection(dict.getShared(), stream); + writeSection(dict.getSubjects(), stream); + writeSection(dict.getPredicates(), stream); + if (multi) { + for (Map.Entry e : dict.getAllObjects().entrySet()) { + CharSequence key = e.getKey(); + sub.add(key); + DictionarySection sec = e.getValue(); + writeSection(sec, stream); + } + } else { + writeSection(dict.getObjects(), stream); + } + } + + // check if all the dynamic sequences are filled + + SyncSeq[] sms = merger.subjectsMaps; + SyncSeq[] pms = merger.predicatesMaps; + SyncSeq[] oms = merger.objectsMaps; + + AtomicLong[] objectCounts = merger.countObject; + AtomicLong[] subjectCounts = merger.countSubject; + + for (int hdtId = 1; hdtId <= impl.hdts.length; hdtId++) { + HDT hdt = impl.hdts[hdtId - 1]; + SyncSeq sm = sms[hdtId - 1]; + SyncSeq pm = pms[hdtId - 1]; + SyncSeq om = oms[hdtId - 1]; + + AtomicLong objectCount = objectCounts[hdtId - 1]; + AtomicLong subjectCount = subjectCounts[hdtId - 1]; + + long shared = hdt.getDictionary().getShared().getNumberOfElements(); + long subjects = hdt.getDictionary().getSubjects().getNumberOfElements(); + long predicates = hdt.getDictionary().getPredicates().getNumberOfElements(); + long objects = + multi ? hdt.getDictionary().getAllObjects().values().stream().mapToLong(DictionarySection::getNumberOfElements).sum() + : hdt.getDictionary().getObjects().getNumberOfElements(); + + assertEquals(shared + objects, objectCount.get()); + assertEquals(shared + subjects, subjectCount.get()); + + for (long i = 1; i <= shared; i++) { + long sv = sm.get(i); + long ov = om.get(i); + if (merger.removeHeader(sv) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing shared subject #" + i + "/" + shared + " for node: " + hdt.getDictionary().idToString(i, TripleComponentRole.SUBJECT)); + } + if (merger.removeHeader(ov) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing shared object #" + i + "/" + shared + " for node: " + hdt.getDictionary().idToString(i, TripleComponentRole.OBJECT)); + } + + assertEquals("shared element not mapped to the same object", ov, sv); + assertTrue("shared mapped element isn't shared", merger.isShared(ov)); + } + + for (long i = 1; i <= subjects; i++) { + if (merger.removeHeader(sm.get(shared + i)) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing subject #" + i + "/" + subjects + " for node: " + hdt.getDictionary().idToString(i + shared, TripleComponentRole.SUBJECT)); + } + } + + for (long i = 1; i <= objects; i++) { + if (merger.removeHeader(om.get(shared + i)) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing object #" + i + "/" + subjects + " for node: " + hdt.getDictionary().idToString(i + shared, TripleComponentRole.OBJECT)); + } + } + + for (long i = 1; i <= predicates; i++) { + if (pm.get(i) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing predicate #" + i + "/" + subjects + " for node: " + hdt.getDictionary().idToString(i, TripleComponentRole.PREDICATE)); + } + } + + } + } + } + try (InputStream stream = new BufferedInputStream(Files.newInputStream(dictFile))) { + // read the sections + try (DictionarySection sh = loadSection(stream); + DictionarySection su = loadSection(stream); + DictionarySection pr = loadSection(stream)) { + Map dictionarySectionMap; + DictionarySection ob; + if (multi) { + ob = null; + dictionarySectionMap = loadMultiSection(sub, stream); + } else { + dictionarySectionMap = Map.of(); + ob = loadSection(stream); + } + try { + // map the excepted hdt + try (HDT exceptedHDT = 
HDTManager.mapHDT(fatcathdt.toAbsolutePath().toString())) { + Dictionary exceptedDict = exceptedHDT.getDictionary(); + assertNotEquals("Invalid test, shared section empty", 0, exceptedHDT.getDictionary().getShared().getNumberOfElements()); + // assert equals between the dictionaries + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Shared", exceptedDict.getShared(), sh); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Subjects", exceptedDict.getSubjects(), su); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Predicates", exceptedDict.getPredicates(), pr); + if (multi) { + Map exceptedDictSub = exceptedDict.getAllObjects(); + dictionarySectionMap.forEach((key, sec) -> { + DictionarySection subSec = exceptedDictSub.get(key); + assertNotNull("sub#" + key + " wasn't found", subSec); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Section#" + key, subSec, sec); + }); + } else { + assert ob != null; + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Objects", exceptedDict.getObjects(), ob); + } + } + } finally { + Closer + .of(ob) + .with(dictionarySectionMap.values()) + .close(); + } + } + } + } finally { + PathUtils.deleteDirectory(root); + } + } + + @Test + public void catTest() throws ParserException, IOException, NotFoundException { + Path root = tempDir.getRoot().toPath(); + try { + // number of HDTs + int count = 10; + long countPerHDT = 1000; + + // create the config + HDTOptions spec = new HDTOptionsBase(); + if (multi) { + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS); + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH); + } + + if (map) { + spec.set(HDTOptionsKeys.HDTCAT_FUTURE_LOCATION, root.resolve("futurehc.hdt").toAbsolutePath()); + } + + // create "count" fake HDTs + LargeFakeDataSetStreamSupplier s = LargeFakeDataSetStreamSupplier + .createSupplierWithMaxTriples(countPerHDT, 42) + .withMaxElementSplit(50) + .withUnicode(unicode); + + List hdts = new ArrayList<>(); + for (int i = 0; i < count; i++) { + String location = root.resolve("hdt" + i + ".hdt").toAbsolutePath().toString(); + hdts.add(location); + s.createAndSaveFakeHDT(spec, location); + } + + // create the excepted HDT from previous algorithm + Path fatcathdt = root.resolve("fatcat.hdt"); + LargeFakeDataSetStreamSupplier + .createSupplierWithMaxTriples(countPerHDT * count, 42) + .withMaxElementSplit(50) + .withUnicode(unicode) + .createAndSaveFakeHDT(spec, fatcathdt.toAbsolutePath().toString()); + + // create dictionary and write sections + // map the excepted hdt + try (HDT actualHDT = HDTManager.catHDT(hdts, spec, null)) { + try (HDT exceptedHDT = HDTManager.mapHDT(fatcathdt.toAbsolutePath().toString())) { + // assert equals between the dictionaries + assertNotEquals(0, actualHDT.getDictionary().getShared().getNumberOfElements()); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT(exceptedHDT, actualHDT); + } + } + } finally { + PathUtils.deleteDirectory(root); + } + } + +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java index d6edc1ef..614d77fc 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java @@ -15,6 +15,7 @@ import org.rdfhdt.hdt.dictionary.Dictionary; import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.impl.MultipleBaseDictionary; +import 
org.rdfhdt.hdt.dictionary.impl.kcat.LocatedIndexedNode; import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.NotFoundException; @@ -156,8 +157,9 @@ public static void assertEqualsHDT(HDT expected, HDT actual) throws NotFoundExce TripleID expectedTriple = expectedIt.next(); TripleID actualTriple = actualIt.next(); - assertEquals(expectedIt.getLastTriplePosition(), actualIt.getLastTriplePosition()); - assertEquals(expectedTriple, actualTriple); + long location = expectedIt.getLastTriplePosition(); + assertEquals("The tripleID location doesn't match", location, actualIt.getLastTriplePosition()); + assertEquals("The tripleID #" + location + " doesn't match", expectedTriple, actualTriple); } assertFalse(actualIt.hasNext()); @@ -236,11 +238,10 @@ public static void checkHDTConsistency(HDT hdt) { }); } - protected static void assertEqualsHDT(String section, DictionarySection excepted, DictionarySection actual) { + public static void assertEqualsHDT(String section, DictionarySection excepted, DictionarySection actual) { assertEquals("sizes of section " + section + " aren't the same!", excepted.getNumberOfElements(), actual.getNumberOfElements()); Iterator itEx = excepted.getSortedEntries(); Iterator itAc = actual.getSortedEntries(); - assertEquals("dictionary section sizes don't match", excepted.getNumberOfElements(), actual.getNumberOfElements()); while (itEx.hasNext()) { assertTrue("dictionary section " + section + " is less big than excepted", itAc.hasNext()); diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java index d5ff8912..05460f0f 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdtDiff/HdtDiffTest.java @@ -173,7 +173,7 @@ public static void assertHdtEquals(HDT hdt1, HDT hdt2) { try { its1 = hdt1.search("", "", ""); - its2 = hdt1.search("", "", ""); + its2 = hdt2.search("", "", ""); } catch (NotFoundException e) { throw new AssertionError(e); } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java index a6729856..b67c1ff4 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java @@ -116,6 +116,8 @@ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxTriples(long m private long slowStream; private boolean sameTripleString; private boolean unicode; + private TripleString buffer; + private TripleString next; private LargeFakeDataSetStreamSupplier(long maxSize, long maxTriples, long seed) { this.maxSize = maxSize; @@ -129,6 +131,8 @@ private LargeFakeDataSetStreamSupplier(long maxSize, long maxTriples, long seed) */ public void reset() { random = new Random(seed); + next = null; + buffer = null; } /** @@ -322,17 +326,21 @@ private CharSequence createValue() { private class FakeStatementIterator implements Iterator { private long size; private long count = 0; - private TripleString buffer; - private TripleString next; + private boolean init; FakeStatementIterator() { - if (sameTripleString) { - buffer = new TripleString(); - } } @Override public boolean hasNext() { + if (!init) { + init = true; + if (next != null) { + long estimation = estimateTripleSize(next); + size += estimation; + 
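// a pending triple left in the supplier-level 'next' field will be returned first by this iterator, so account for its size and count here
+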
count++; + } + } if (size >= maxSize || count > maxTriples) { return false; } @@ -379,8 +387,8 @@ public TripleString next() { if (!hasNext()) { return null; } - TripleString next = this.next; - this.next = null; + TripleString next = LargeFakeDataSetStreamSupplier.this.next; + LargeFakeDataSetStreamSupplier.this.next = null; return next; } } @@ -449,6 +457,11 @@ public LargeFakeDataSetStreamSupplier withSlowStream(long slowStream) { */ public LargeFakeDataSetStreamSupplier withSameTripleString(boolean sameTripleString) { this.sameTripleString = sameTripleString; + if (sameTripleString) { + buffer = new TripleString(); + } else { + buffer = null; + } return this; } diff --git a/hdt-java-package/bin/javaenv.bat b/hdt-java-package/bin/javaenv.bat index e8626431..e888e622 100644 --- a/hdt-java-package/bin/javaenv.bat +++ b/hdt-java-package/bin/javaenv.bat @@ -1,2 +1,3 @@ set JAVAOPTIONS=-Xmx1G set JAVACMD=java +set RDFHDT_COLOR=false diff --git a/hdt-java-package/bin/javaenv.sh b/hdt-java-package/bin/javaenv.sh index 46899d06..c384cc22 100644 --- a/hdt-java-package/bin/javaenv.sh +++ b/hdt-java-package/bin/javaenv.sh @@ -21,6 +21,11 @@ else JAVA="$JAVA_HOME/bin/java -server" fi +# Set HDT Color options, set to true to allow color +if [ "$RDFHDT_COLOR" = "" ] ; then + export RDFHDT_COLOR="false" +fi + # Set Java options if [ "$JAVA_OPTIONS" = "" ] ; then JAVA_OPTIONS="-Xmx1g" From eac943b6ef4ae65ab6e0711f6d115aafc0939cf6 Mon Sep 17 00:00:00 2001 From: qaate47 Date: Wed, 30 Nov 2022 19:56:13 +0100 Subject: [PATCH 2/2] Add k-HDTCat to HDTCat-Tree with profiling --- .../rdfhdt/hdt/options/HDTOptionsKeys.java | 6 + .../java/org/rdfhdt/hdt/util/Profiler.java | 1 + hdt-java-cli/bin/hdtCat.bat | 5 + .../compact/sequence/SequenceLog64Map.java | 3 +- .../impl/kcat/GroupBySubjectMapIterator.java | 42 +- .../hdt/dictionary/impl/kcat/KCatImpl.java | 43 +- .../hdt/dictionary/impl/kcat/KCatMerger.java | 25 +- .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java | 142 ++++- .../hdt/iterator/utils/ExceptionIterator.java | 21 + .../dictionary/impl/kcat/KCatMergerTest.java | 551 +++++++++--------- .../org/rdfhdt/hdt/hdt/HDTManagerTest.java | 29 +- .../util/LargeFakeDataSetStreamSupplier.java | 44 +- .../LargeFakeDataSetStreamSupplierTest.java | 99 ++++ hdt-java-package/bin/hdtCat.bat | 5 + 14 files changed, 665 insertions(+), 351 deletions(-) create mode 100644 hdt-java-cli/bin/hdtCat.bat create mode 100644 hdt-java-package/bin/hdtCat.bat diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java index a795da1c..d41b2b0b 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java @@ -140,6 +140,12 @@ public class HDTOptionsKeys { */ @Key(type = Key.Type.DOUBLE, desc = "Memory fault factor for HDTCat tree method split") public static final String LOADER_CATTREE_MEMORY_FAULT_FACTOR = "loader.cattree.memoryFaultFactor"; + /** + * Key for the k-merge HDTCat for the {@link org.rdfhdt.hdt.hdt.HDTManager} catTree method, defaults to 1, i.e. using the + * default implementation of HDTCat instead of K-HDTCat + */ + @Key(type = Key.Type.NUMBER, desc = "Number of HDTs to merge at the same time with K-HDTCat, by default it uses the default HDTCat implementation") + public static final String LOADER_CATTREE_KCAT = "loader.cattree.kcat"; /** * Key for the hdt supplier type, default to memory diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/util/Profiler.java
b/hdt-api/src/main/java/org/rdfhdt/hdt/util/Profiler.java index 12e811d6..4a4f041b 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/util/Profiler.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/util/Profiler.java @@ -270,6 +270,7 @@ public void writeToDisk(Path outputPath) throws IOException { public Section getMainSection() { if (this.mainSection == null) { this.mainSection = new Section(name); + maxSize = Math.max(name.length() + deep * 2, maxSize); } return this.mainSection; } diff --git a/hdt-java-cli/bin/hdtCat.bat b/hdt-java-cli/bin/hdtCat.bat new file mode 100644 index 00000000..b2b8d90b --- /dev/null +++ b/hdt-java-cli/bin/hdtCat.bat @@ -0,0 +1,5 @@ +@echo off + +call "%~dp0\javaenv.bat" + +"%JAVACMD%" %JAVAOPTIONS% -classpath %~dp0\..\lib\* org.rdfhdt.hdt.tools.HDTCat %* diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64Map.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64Map.java index b98c0b46..3bdb5d2c 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64Map.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64Map.java @@ -79,7 +79,6 @@ public SequenceLog64Map(CountInputStream in, File f) throws IOException { this(in, f, false); } - @SuppressWarnings("resource") private SequenceLog64Map(CountInputStream in, File f, boolean closeInput) throws IOException { CRCInputStream crcin = new CRCInputStream(in, new CRC8()); @@ -189,7 +188,7 @@ private long getWord(long w) { @Override public long get(long index) { if(index<0 || index>=numentries) { - throw new IndexOutOfBoundsException(); + throw new IndexOutOfBoundsException(index + " < 0 || " + index + " >= " + numentries); } if(numbits==0) return 0; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/GroupBySubjectMapIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/GroupBySubjectMapIterator.java index 17af745c..c9454d50 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/GroupBySubjectMapIterator.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/GroupBySubjectMapIterator.java @@ -86,6 +86,10 @@ public TripleID next() { } private static long firstSubjectTripleId(HDT hdt) { + if (hdt.getDictionary().getSubjects().getNumberOfElements() == 0) { + // no subjects + return -1; + } IteratorTripleID it = hdt.getTriples().search(new TripleID( hdt.getDictionary().getNshared() + 1, 0, @@ -107,21 +111,31 @@ public static Iterator fromHDTs(KCatMerger merger, HDT[] hdts) { // extract hdt elements for this index HDT hdt = hdts[hdtIndex]; + if (hdt.getTriples().getNumberOfElements() == 0) { + // no triples + return ExceptionIterator.empty(); + } // get the first subject triple id long firstSubjectTripleId = firstSubjectTripleId(hdt); - // create a subject iterator, mapped to the new IDs - IteratorTripleID subjectIterator = hdt.getTriples().searchAll(); - subjectIterator.goTo(firstSubjectTripleId); - ExceptionIterator subjectIteratorMapped = ExceptionIterator.of( - new SharedOnlyIterator( - new MapIterator<>(subjectIterator, (tid) -> { - assert inHDT(tid, hdts[hdtIndex]); - return merger.extractMapped(hdtIndex, tid); - }), - shared - ) - ); + ExceptionIterator subjectIteratorMapped; + if (firstSubjectTripleId == -1) { + // no triples + subjectIteratorMapped = ExceptionIterator.empty(); + } else { + // create a subject iterator, mapped to the new IDs + IteratorTripleID subjectIterator = hdt.getTriples().searchAll(); +
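// start after the triples whose subject is in the shared section; those are handled separately below
+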
subjectIterator.goTo(firstSubjectTripleId); + subjectIteratorMapped = ExceptionIterator.of( + new SharedOnlyIterator( + new MapIterator<>(subjectIterator, (tid) -> { + assert inHDT(tid, hdts[hdtIndex]); + return merger.extractMapped(hdtIndex, tid); + }), + shared + ) + ); + } if (shared == 0) { return subjectIteratorMapped; @@ -147,6 +161,10 @@ public static Iterator fromHDTs(KCatMerger merger, HDT[] hdts) { // get the first subject triple id long firstSubjectTripleId = firstSubjectTripleId(hdt); + if (firstSubjectTripleId == -1) { + return ExceptionIterator.empty(); + } + // create a subject iterator, mapped to the new IDs IteratorTripleID subjectIterator = hdt.getTriples().searchAll(); subjectIterator.goTo(firstSubjectTripleId); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatImpl.java index 82136015..85a53789 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatImpl.java @@ -9,6 +9,7 @@ import org.rdfhdt.hdt.hdt.impl.HDTBase; import org.rdfhdt.hdt.hdt.impl.WriteHDTImpl; import org.rdfhdt.hdt.header.HeaderFactory; +import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTOptionsKeys; @@ -17,6 +18,7 @@ import org.rdfhdt.hdt.triples.impl.BitmapTriples; import org.rdfhdt.hdt.triples.impl.OneReadTempTriples; import org.rdfhdt.hdt.triples.impl.WriteBitmapTriples; +import org.rdfhdt.hdt.util.Profiler; import org.rdfhdt.hdt.util.io.CloseSuppressPath; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -54,12 +56,13 @@ private static TripleComponentOrder getOrder(HDT hdt) { private final Path futureLocation; private final boolean futureMap; private final boolean clearLocation; - private final ProgressListener listener; + private final MultiThreadListener listener; private final String dictionaryType; private final int bufferSize; private final HDTOptions hdtFormat; private final TripleComponentOrder order; private final long rawSize; + private final Profiler profiler; /** * Create implementation @@ -82,6 +85,8 @@ public KCatImpl(List hdtFileNames, HDTOptions hdtFormat, ProgressListene bufferSize = (int) bufferSizeLong; } + profiler = Profiler.createOrLoadSubSection("doHDTCatk", hdtFormat, true); + try { ListIterator it = hdtFileNames.listIterator(); @@ -147,10 +152,14 @@ public KCatImpl(List hdtFileNames, HDTOptions hdtFormat, ProgressListene location.closeWithDeleteRecurse(); } catch (Throwable t) { - for (HDT hdt : hdts) { - IOUtil.closeQuietly(hdt); + try { + throw t; + } finally { + for (HDT hdt : hdts) { + IOUtil.closeQuietly(hdt); + } + profiler.close(); } - throw t; } } @@ -172,9 +181,13 @@ public HDT cat() throws IOException { il.setRange(0, 40); il.setPrefix("Merge Dict: "); try (KCatMerger merger = createMerger(il)) { + profiler.pushSection("dict"); // create the dictionary try (DictionaryPrivate dictionary = merger.buildDictionary()) { + profiler.popSection(); assert merger.assertReadCorrectly(); + listener.unregisterAllThreads(); + profiler.pushSection("triples"); // create a GROUP BY subject iterator to get the new ordered stream Iterator tripleIterator = GroupBySubjectMapIterator.fromHDTs(merger, hdts); try (WriteBitmapTriples triples = new WriteBitmapTriples(hdtFormat, location.resolve("triples"), 
bufferSize)) { @@ -182,21 +195,28 @@ public HDT cat() throws IOException { il.setRange(40, 80); il.setPrefix("Merge triples: "); + il.notifyProgress(0, "start"); triples.load(new OneReadTempTriples(tripleIterator, order, count), il); + profiler.popSection(); WriteHDTImpl writeHDT = new WriteHDTImpl(hdtFormat, location, dictionary, triples, HeaderFactory.createHeader(hdtFormat)); + profiler.pushSection("header"); writeHDT.populateHeaderStructure(baseURI); // add a raw size from the previous values (if available) if (rawSize != -1) { writeHDT.getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, String.valueOf(rawSize)); } + profiler.popSection(); + profiler.pushSection("save"); il.setRange(80, 90); il.setPrefix("Save HDT: "); + il.notifyProgress(0, "save to " + futureLocationStr); writeHDT.saveToHDT(futureLocationStr, il); + profiler.popSection(); } } - + listener.unregisterAllThreads(); } catch (InterruptedException e) { throw new IOException("Interruption", e); } @@ -204,8 +224,10 @@ public HDT cat() throws IOException { il.setRange(90, 100); HDT hdt; if (futureMap) { + il.notifyProgress(0, "map hdt"); hdt = HDTManager.mapHDT(futureLocationStr, il); } else { + il.notifyProgress(0, "load hdt"); hdt = HDTManager.loadHDT(futureLocationStr, il); Files.deleteIfExists(futureLocation); } @@ -216,7 +238,16 @@ public HDT cat() throws IOException { @Override public void close() throws IOException { try { - IOUtil.closeAll(hdts); + try { + try { + profiler.stop(); + profiler.writeProfiling(); + } finally { + profiler.close(); + } + } finally { + IOUtil.closeAll(hdts); + } } finally { if (clearLocation) { location.close(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMerger.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMerger.java index d59983fd..035c3d17 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMerger.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMerger.java @@ -60,10 +60,10 @@ public class KCatMerger implements AutoCloseable { private final PipedCopyIterator subjectPipe = new PipedCopyIterator<>(); private final PipedCopyIterator objectPipe = new PipedCopyIterator<>(); private final PipedCopyIterator sharedPipe = new PipedCopyIterator<>(); - private final DuplicateBufferIterator sortedSubject; - private final DuplicateBufferIterator sortedObject; - private final DuplicateBufferIterator sortedPredicates; - private final Map> sortedSubSections; + private final ExceptionIterator sortedSubject; + private final ExceptionIterator sortedObject; + private final ExceptionIterator sortedPredicates; + private final Map> sortedSubSections; private final long estimatedSizeP; private final AtomicLong countTyped = new AtomicLong(); @@ -107,6 +107,7 @@ public KCatMerger(HDT[] hdts, CloseSuppressPath location, ProgressListener liste long sizeS = 0; long sizeP = 0; long sizeO = 0; + long sizeONoTyped = 0; long sizeShared = 0; Map subSections = new TreeMap<>(); @@ -118,6 +119,7 @@ public KCatMerger(HDT[] hdts, CloseSuppressPath location, ProgressListener liste sizeS += cat.countSubjects(); sizeP += cat.countPredicates(); sizeO += cat.countObjects(); + sizeONoTyped += cat.getObjectSection().getNumberOfElements(); sizeShared += cat.countShared(); long start = 1L + cat.countShared(); @@ -165,7 +167,8 @@ public KCatMerger(HDT[] hdts, CloseSuppressPath location, ProgressListener liste c.getSharedSection().getSortedEntries(), c.countShared() ) - ); + ).notif(sizeS, 20, "Merge 
subjects", listener); + sortedObject = mergeSection( cats, (hdtIndex, c) -> createMergeIt( @@ -174,20 +177,20 @@ public KCatMerger(HDT[] hdts, CloseSuppressPath location, ProgressListener liste c.getSharedSection().getSortedEntries(), c.objectShift() ) - ); + ).notif(sizeONoTyped, 20, "Merge objects", listener); // merge the other sections sortedPredicates = mergeSection(cats, (hdtIndex, c) -> { ExceptionIterator of = ExceptionIterator.of(c.getPredicateSection().getSortedEntries()); return of.map(((element, index) -> new LocatedIndexedNode(hdtIndex, index + 1, ByteString.of(element)))); - }); + }).notif(sizeP, 20, "Merge predicates", listener); sortedSubSections = new TreeMap<>(); // create a merge section for each section subSections.forEach((key, sections) -> sortedSubSections.put(key, mergeSection(sections, (hdtIndex, pre) -> { ExceptionIterator of = ExceptionIterator.of(pre.getSortedEntries()); return of.map(((element, index) -> new LocatedIndexedNode(hdtIndex, pre.getStart() + index, ByteString.of(element)))); - }))); + }).notif(Arrays.stream(sections).mapToLong(s -> s == null || s.section == null ? 0 : s.section.getNumberOfElements()).sum(), 20, "Merge typed objects", listener))); // convert the dupe buffer streams to byte string streams @@ -288,7 +291,7 @@ public static DuplicateBufferIterator mergeSection(T[] sec return mapper.apply(hdtIndex, e); }, LocatedIndexedNode::compareTo, - List.of(sections), + Arrays.asList(sections), 0, sections.length ), @@ -471,7 +474,7 @@ private void runSubSectionCompute() { ByteString key = e.getKey(); WriteDictionarySection section = e.getValue(); - DuplicateBufferIterator bufferIterator = sortedSubSections.get(key); + ExceptionIterator bufferIterator = sortedSubSections.get(key); final long currentShift = shift; section.load(new OneReadDictionarySection(bufferIterator.map((db, id) -> { @@ -505,7 +508,7 @@ public void close() throws IOException { throw new RuntimeException(e); } finally { Closer.of(sectionSubject, sectionPredicate, sectionObject, sectionShared) - .with(sectionSub.values()) + .with(sectionSub == null ? 
List.of() : sectionSub.values()) .with(subjectsMaps) .with(predicatesMaps) .with(objectsMaps) diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index 86eefc5b..73f969ea 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -43,6 +43,7 @@ import java.util.Iterator; import java.util.List; import java.util.Optional; +import java.util.stream.Collectors; public class HDTManagerImpl extends HDTManager { private static final Logger logger = LoggerFactory.getLogger(HDTManagerImpl.class); @@ -383,9 +384,62 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStre } } + /** + * get the previous HDTs to merge with current + * + * @param nextFile if we can create a new HDT after this one + * @param files hdt files to merge + * @param current current created HDT + * @param maxFiles max number of files to merge + * @return list of HDTs to merge with current, or an empty list if no merge is needed yet + */ + private List getNextHDTs(boolean nextFile, List files, HDTFile current, int maxFiles) { + if (files.isEmpty()) { + return List.of(); + } + List next = new ArrayList<>(); + if (nextFile || files.size() > maxFiles) { + for (int i = 1; i < maxFiles && i <= files.size(); i++) { + HDTFile old = files.get(files.size() - i); + + // check if the chunks are matching + if (nextFile && old.getChunks() > current.getChunks()) { + break; + } + + next.add(old); + } + if (!nextFile || next.size() == maxFiles - 1) { + // we have all the elements, or we have enough files + // we remove the elements from the files + for (int i = 0; i < next.size(); i++) { + files.remove(files.size() - 1); + } + } else { + return List.of(); + } + } else { + next.addAll(files); + files.clear(); + } + next.add(current); + return next; + } + @Override protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { Path basePath; + + long khdtCatOpt = hdtFormat.getInt(HDTOptionsKeys.LOADER_CATTREE_KCAT, 1); + + int kHDTCat; + + if (khdtCatOpt > 0 && khdtCatOpt < Integer.MAX_VALUE - 6) { + kHDTCat = (int) khdtCatOpt; + } else { + throw new IllegalArgumentException("Invalid kcat value: " + khdtCatOpt); + } + String baseNameOpt = hdtFormat.get(HDTOptionsKeys.LOADER_CATTREE_LOCATION_KEY); if (baseNameOpt == null || baseNameOpt.isEmpty()) { @@ -421,6 +475,8 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< profiler.pushSection("generateHDT #" + gen); PrefixListener il = PrefixListener.of("gen#" + gen, listener); Path hdtLocation = hdtStore.resolve("hdt-" + gen + ".hdt"); + // run a GC to help the memory-based flux stop algorithm measure the available memory + System.gc(); supplier.doGenerateHDT(it, baseURI, hdtFormat, il, hdtLocation); il.clearThreads(); @@ -429,28 +485,61 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< profiler.popSection(); // merge the generated hdt with each block with enough size - while (!files.isEmpty() && (!nextFile || (files.get(files.size() - 1)).getChunks() <= hdtFile.getChunks())) { - HDTFile lastHDTFile = files.remove(files.size() - 1); - cat++; - profiler.pushSection("catHDT #" + cat); - PrefixListener ilc = PrefixListener.of("cat#" + cat, listener); - Path hdtCatFileLocation = hdtStore.resolve("hdtcat-" + cat + ".hdt"); - try (HDT abcat = HDTManager.catHDT( - hdtCatLocation,
lastHDTFile.getHdtFile().toAbsolutePath().toString(), - hdtFile.getHdtFile().toAbsolutePath().toString(), - hdtFormat, ilc)) { - abcat.saveToHDT(hdtCatFileLocation.toAbsolutePath().toString(), ilc); + if (kHDTCat == 1) { // default impl + while (!files.isEmpty() && (!nextFile || (files.get(files.size() - 1)).getChunks() <= hdtFile.getChunks())) { + HDTFile lastHDTFile = files.remove(files.size() - 1); + cat++; + profiler.pushSection("catHDT #" + cat); + PrefixListener ilc = PrefixListener.of("cat#" + cat, listener); + Path hdtCatFileLocation = hdtStore.resolve("hdtcat-" + cat + ".hdt"); + try (HDT abcat = HDTManager.catHDT( + hdtCatLocation, + lastHDTFile.getHdtFile().toAbsolutePath().toString(), + hdtFile.getHdtFile().toAbsolutePath().toString(), + hdtFormat, ilc)) { + abcat.saveToHDT(hdtCatFileLocation.toAbsolutePath().toString(), ilc); + } + ilc.clearThreads(); + // delete previous chunks + Files.delete(lastHDTFile.getHdtFile()); + Files.delete(hdtFile.getHdtFile()); + // note the new hdt file and the number of chunks + hdtFile = new HDTFile(hdtCatFileLocation, lastHDTFile.getChunks() + hdtFile.getChunks()); + + profiler.popSection(); + } + } else { // kcat + List nextHDTs; + + while (!(nextHDTs = getNextHDTs(nextFile, files, hdtFile, kHDTCat)).isEmpty()) { + // merge all the files + cat++; + profiler.pushSection("catHDT #" + cat); + PrefixListener ilc = PrefixListener.of("cat#" + cat, listener); + Path hdtCatFileLocation = hdtStore.resolve("hdtcat-" + cat + ".hdt"); + + assert nextHDTs.size() > 1; + + try (HDT abcat = HDTManager.catHDT( + nextHDTs.stream().map(f -> f.getHdtFile().toAbsolutePath().toString()).collect(Collectors.toList()), + hdtFormat, + ilc)) { + abcat.saveToHDT(hdtCatFileLocation.toAbsolutePath().toString(), ilc); + } + ilc.clearThreads(); + + // delete previous chunks + for (HDTFile nextHDT : nextHDTs) { + Files.delete(nextHDT.getHdtFile()); + } + // note the new hdt file and the number of chunks + long chunks = nextHDTs.stream().mapToLong(HDTFile::getChunks).sum(); + hdtFile = new HDTFile(hdtCatFileLocation, chunks); + + profiler.popSection(); } - ilc.clearThreads(); - // delete previous chunks - Files.delete(lastHDTFile.getHdtFile()); - Files.delete(hdtFile.getHdtFile()); - // note the new hdt file and the number of chunks - hdtFile = new HDTFile(hdtCatFileLocation, lastHDTFile.getChunks() + hdtFile.getChunks()); - - profiler.popSection(); } + assert nextFile || files.isEmpty() : "no data remaining, but contains files"; files.add(hdtFile); } while (nextFile); @@ -458,8 +547,9 @@ protected HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator< Path hdtFile = files.get(0).hdtFile; - assert files.get(0).getChunks() == gen; - assert cat < gen; + assert files.size() == 1 : "more than 1 file: " + files; + assert cat < gen : "more cat than gen"; + assert files.get(0).getChunks() == gen : "gen size isn't the same as excepted: " + files.get(0).getChunks() + " != " + gen; try { // if a future HDT location has been asked, move to it and map the HDT @@ -510,5 +600,13 @@ public long getChunks() { public Path getHdtFile() { return hdtFile; } + + @Override + public String toString() { + return "HDTFile{" + + "hdtFile=" + hdtFile + + ", chunks=" + chunks + + '}'; + } } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java index bf59bdfb..1a04b1bd 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java +++ 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java @@ -1,5 +1,7 @@ package org.rdfhdt.hdt.iterator.utils; +import org.rdfhdt.hdt.listener.ProgressListener; + import java.util.Iterator; import java.util.Objects; import java.util.function.Consumer; @@ -110,6 +112,25 @@ default ExceptionIterator map(MapExceptionIterator.MapWithIdFunction(this, mappingFunc); } + /** + * Convert to notification iterator + * + * @param estimatedSize the estimated size + * @param maxSplit the maximum split + * @param message message of the notification + * @param listener listener + * @return notification iterator + */ + default ExceptionIterator notif(long estimatedSize, long maxSplit, String message, ProgressListener listener) { + return new NotificationExceptionIterator<>( + this, + estimatedSize, + Math.max(maxSplit, estimatedSize / 10_000), + message, + listener + ); + } + /** * convert this exception iterator to a base iterator and convert the exception to RuntimeException * @return iterator diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMergerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMergerTest.java index d1be2e0e..67a9eace 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMergerTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/kcat/KCatMergerTest.java @@ -6,7 +6,6 @@ import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import org.rdfhdt.hdt.compact.sequence.DynamicSequence; import org.rdfhdt.hdt.dictionary.Dictionary; import org.rdfhdt.hdt.dictionary.DictionaryPrivate; import org.rdfhdt.hdt.dictionary.DictionarySection; @@ -27,292 +26,286 @@ import org.rdfhdt.hdt.util.io.Closer; import org.rdfhdt.hdt.util.string.ByteString; -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; +import java.io.*; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; +import java.util.*; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; +import java.util.stream.IntStream; import java.util.stream.Stream; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; +import static org.junit.Assert.*; @RunWith(Parameterized.class) public class KCatMergerTest extends AbstractMapMemoryTest { - @Parameterized.Parameters(name = "multi: {0}, unicode: {1}, map: {2}") - public static Collection params() { - return Stream.of(false, true).flatMap( - multi -> Stream.of(false, true).flatMap( - unicode -> Stream.of(false, true).map( - map -> new Object[]{multi, unicode, map} - ) - ) - ).collect(Collectors.toList()); - } - - @Parameterized.Parameter - public boolean multi; - @Parameterized.Parameter(1) - public boolean unicode; - @Parameterized.Parameter(2) - public boolean map; - - @Rule - public TemporaryFolder tempDir = new TemporaryFolder(); - - private void writeSection(DictionarySection sec, OutputStream stream) throws IOException { - ((DictionarySectionPrivate) sec).save(stream, null); - } - - private DictionarySection loadSection(InputStream stream) throws IOException { - 
PFCDictionarySection section = new PFCDictionarySection(new HDTOptionsBase()); - section.load(stream, null); - return section; - } - - private Map loadMultiSection(List seq, InputStream stream) throws IOException { - Map sectionMap = new TreeMap<>(); - for (CharSequence key : seq) { - PFCDictionarySection section = new PFCDictionarySection(new HDTOptionsBase()); - section.load(stream, null); - sectionMap.put(ByteString.of(key), section); - } - return sectionMap; - } - - @Test - public void mergerTest() throws ParserException, IOException, InterruptedException { - Path root = tempDir.getRoot().toPath(); - try { - int count = 10; - HDTOptions spec = new HDTOptionsBase(); - - if (multi) { - spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS); - spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH); - } - - // create "count" fake HDTs - LargeFakeDataSetStreamSupplier s = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxTriples(1000, 42) - .withMaxElementSplit(50) - .withUnicode(unicode); - - List hdts = new ArrayList<>(); - for (int i = 0; i < count; i++) { - String location = root.resolve("hdt" + i + ".hdt").toAbsolutePath().toString(); - hdts.add(location); - s.createAndSaveFakeHDT(spec, location); - } - - // create the excepted HDT from previous algorithm - Path fatcathdt = root.resolve("fatcat.hdt"); - LargeFakeDataSetStreamSupplier - .createSupplierWithMaxTriples(1000 * count, 42) - .withMaxElementSplit(50) - .withUnicode(unicode) - .createAndSaveFakeHDT(spec, fatcathdt.toAbsolutePath().toString()); - - // create dictionary and write sections - Path dictFile = root.resolve("dict"); - List sub = new ArrayList<>(); - try (KCatImpl impl = new KCatImpl(hdts, spec, null)) { - try (KCatMerger merger = impl.createMerger(null)) { - assertEquals(multi, merger.typedHDT); - merger.startMerger(); - // create - DictionaryPrivate dict = merger.buildDictionary(); - try (OutputStream stream = new BufferedOutputStream(Files.newOutputStream(dictFile))) { - writeSection(dict.getShared(), stream); - writeSection(dict.getSubjects(), stream); - writeSection(dict.getPredicates(), stream); - if (multi) { - for (Map.Entry e : dict.getAllObjects().entrySet()) { - CharSequence key = e.getKey(); - sub.add(key); - DictionarySection sec = e.getValue(); - writeSection(sec, stream); - } - } else { - writeSection(dict.getObjects(), stream); - } - } - - // check if all the dynamic sequences are filled - - SyncSeq[] sms = merger.subjectsMaps; - SyncSeq[] pms = merger.predicatesMaps; - SyncSeq[] oms = merger.objectsMaps; - - AtomicLong[] objectCounts = merger.countObject; - AtomicLong[] subjectCounts = merger.countSubject; - - for (int hdtId = 1; hdtId <= impl.hdts.length; hdtId++) { - HDT hdt = impl.hdts[hdtId - 1]; - SyncSeq sm = sms[hdtId - 1]; - SyncSeq pm = pms[hdtId - 1]; - SyncSeq om = oms[hdtId - 1]; - - AtomicLong objectCount = objectCounts[hdtId - 1]; - AtomicLong subjectCount = subjectCounts[hdtId - 1]; - - long shared = hdt.getDictionary().getShared().getNumberOfElements(); - long subjects = hdt.getDictionary().getSubjects().getNumberOfElements(); - long predicates = hdt.getDictionary().getPredicates().getNumberOfElements(); - long objects = - multi ? 
hdt.getDictionary().getAllObjects().values().stream().mapToLong(DictionarySection::getNumberOfElements).sum() - : hdt.getDictionary().getObjects().getNumberOfElements(); - - assertEquals(shared + objects, objectCount.get()); - assertEquals(shared + subjects, subjectCount.get()); - - for (long i = 1; i <= shared; i++) { - long sv = sm.get(i); - long ov = om.get(i); - if (merger.removeHeader(sv) == 0) { - fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing shared subject #" + i + "/" + shared + " for node: " + hdt.getDictionary().idToString(i, TripleComponentRole.SUBJECT)); - } - if (merger.removeHeader(ov) == 0) { - fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing shared object #" + i + "/" + shared + " for node: " + hdt.getDictionary().idToString(i, TripleComponentRole.OBJECT)); - } - - assertEquals("shared element not mapped to the same object", ov, sv); - assertTrue("shared mapped element isn't shared", merger.isShared(ov)); - } - - for (long i = 1; i <= subjects; i++) { - if (merger.removeHeader(sm.get(shared + i)) == 0) { - fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing subject #" + i + "/" + subjects + " for node: " + hdt.getDictionary().idToString(i + shared, TripleComponentRole.SUBJECT)); - } - } - - for (long i = 1; i <= objects; i++) { - if (merger.removeHeader(om.get(shared + i)) == 0) { - fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing object #" + i + "/" + subjects + " for node: " + hdt.getDictionary().idToString(i + shared, TripleComponentRole.OBJECT)); - } - } - - for (long i = 1; i <= predicates; i++) { - if (pm.get(i) == 0) { - fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing predicate #" + i + "/" + subjects + " for node: " + hdt.getDictionary().idToString(i, TripleComponentRole.PREDICATE)); - } - } - - } - } - } - try (InputStream stream = new BufferedInputStream(Files.newInputStream(dictFile))) { - // read the sections - try (DictionarySection sh = loadSection(stream); - DictionarySection su = loadSection(stream); - DictionarySection pr = loadSection(stream)) { - Map dictionarySectionMap; - DictionarySection ob; - if (multi) { - ob = null; - dictionarySectionMap = loadMultiSection(sub, stream); - } else { - dictionarySectionMap = Map.of(); - ob = loadSection(stream); - } - try { - // map the excepted hdt - try (HDT exceptedHDT = HDTManager.mapHDT(fatcathdt.toAbsolutePath().toString())) { - Dictionary exceptedDict = exceptedHDT.getDictionary(); - assertNotEquals("Invalid test, shared section empty", 0, exceptedHDT.getDictionary().getShared().getNumberOfElements()); - // assert equals between the dictionaries - HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Shared", exceptedDict.getShared(), sh); - HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Subjects", exceptedDict.getSubjects(), su); - HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Predicates", exceptedDict.getPredicates(), pr); - if (multi) { - Map exceptedDictSub = exceptedDict.getAllObjects(); - dictionarySectionMap.forEach((key, sec) -> { - DictionarySection subSec = exceptedDictSub.get(key); - assertNotNull("sub#" + key + " wasn't found", subSec); - HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Section#" + key, subSec, sec); - }); - } else { - assert ob != null; - HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Objects", exceptedDict.getObjects(), ob); - } - } - } finally { - Closer - .of(ob) - .with(dictionarySectionMap.values()) - .close(); - } - } - } - } finally { - PathUtils.deleteDirectory(root); - } - } - - @Test - public void catTest() throws 
ParserException, IOException, NotFoundException { - Path root = tempDir.getRoot().toPath(); - try { - // number of HDTs - int count = 10; - long countPerHDT = 1000; - - // create the config - HDTOptions spec = new HDTOptionsBase(); - if (multi) { - spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS); - spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH); - } - - if (map) { - spec.set(HDTOptionsKeys.HDTCAT_FUTURE_LOCATION, root.resolve("futurehc.hdt").toAbsolutePath()); - } - - // create "count" fake HDTs - LargeFakeDataSetStreamSupplier s = LargeFakeDataSetStreamSupplier - .createSupplierWithMaxTriples(countPerHDT, 42) - .withMaxElementSplit(50) - .withUnicode(unicode); - - List hdts = new ArrayList<>(); - for (int i = 0; i < count; i++) { - String location = root.resolve("hdt" + i + ".hdt").toAbsolutePath().toString(); - hdts.add(location); - s.createAndSaveFakeHDT(spec, location); - } - - // create the excepted HDT from previous algorithm - Path fatcathdt = root.resolve("fatcat.hdt"); - LargeFakeDataSetStreamSupplier - .createSupplierWithMaxTriples(countPerHDT * count, 42) - .withMaxElementSplit(50) - .withUnicode(unicode) - .createAndSaveFakeHDT(spec, fatcathdt.toAbsolutePath().toString()); - - // create dictionary and write sections - // map the excepted hdt - try (HDT actualHDT = HDTManager.catHDT(hdts, spec, null)) { - try (HDT exceptedHDT = HDTManager.mapHDT(fatcathdt.toAbsolutePath().toString())) { - // assert equals between the dictionaries - assertNotEquals(0, actualHDT.getDictionary().getShared().getNumberOfElements()); - HDTManagerTest.HDTManagerTestBase.assertEqualsHDT(exceptedHDT, actualHDT); - } - } - } finally { - PathUtils.deleteDirectory(root); - } - } + @Parameterized.Parameters(name = "multi: {0}, unicode: {1}, map: {2}, count: {3}") + public static Collection params() { + return Stream.of(false, true).flatMap( + multi -> Stream.of(false, true).flatMap( + unicode -> Stream.of(false, true).flatMap( + map -> Stream.of(2, 10).map( + kcat -> new Object[]{multi, unicode, map, kcat} + ) + ) + ) + ).collect(Collectors.toList()); + } + + @Parameterized.Parameter + public boolean multi; + @Parameterized.Parameter(1) + public boolean unicode; + @Parameterized.Parameter(2) + public boolean map; + @Parameterized.Parameter(3) + public int kcat; + + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + + private void writeSection(DictionarySection sec, OutputStream stream) throws IOException { + ((DictionarySectionPrivate) sec).save(stream, null); + } + + private DictionarySection loadSection(InputStream stream) throws IOException { + PFCDictionarySection section = new PFCDictionarySection(new HDTOptionsBase()); + section.load(stream, null); + return section; + } + + private Map loadMultiSection(List seq, InputStream stream) throws IOException { + Map sectionMap = new TreeMap<>(); + for (CharSequence key : seq) { + PFCDictionarySection section = new PFCDictionarySection(new HDTOptionsBase()); + section.load(stream, null); + sectionMap.put(ByteString.of(key), section); + } + return sectionMap; + } + + @Test + public void mergerTest() throws ParserException, IOException, InterruptedException { + Path root = tempDir.getRoot().toPath(); + try { + HDTOptions spec = new HDTOptionsBase(); + + if (multi) { + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS); + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, 
HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH); + } + + // create "kcat" fake HDTs + LargeFakeDataSetStreamSupplier s = LargeFakeDataSetStreamSupplier + .createSupplierWithMaxTriples(1000, 42) + .withMaxElementSplit(50) + .withUnicode(unicode); + + List hdts = new ArrayList<>(); + for (int i = 0; i < kcat; i++) { + String location = root.resolve("hdt" + i + ".hdt").toAbsolutePath().toString(); + hdts.add(location); + s.createAndSaveFakeHDT(spec, location); + } + + // create the excepted HDT from previous algorithm + Path fatcathdt = root.resolve("fatcat.hdt"); + LargeFakeDataSetStreamSupplier + .createSupplierWithMaxTriples(1000L * kcat, 42) + .withMaxElementSplit(50) + .withUnicode(unicode) + .createAndSaveFakeHDT(spec, fatcathdt.toAbsolutePath().toString()); + + // create dictionary and write sections + Path dictFile = root.resolve("dict"); + List sub = new ArrayList<>(); + try (KCatImpl impl = new KCatImpl(hdts, spec, null)) { + try (KCatMerger merger = impl.createMerger(null)) { + assertEquals(multi, merger.typedHDT); + merger.startMerger(); + // create + DictionaryPrivate dict = merger.buildDictionary(); + try (OutputStream stream = new BufferedOutputStream(Files.newOutputStream(dictFile))) { + writeSection(dict.getShared(), stream); + writeSection(dict.getSubjects(), stream); + writeSection(dict.getPredicates(), stream); + if (multi) { + for (Map.Entry e : dict.getAllObjects().entrySet()) { + CharSequence key = e.getKey(); + sub.add(key); + DictionarySection sec = e.getValue(); + writeSection(sec, stream); + } + } else { + writeSection(dict.getObjects(), stream); + } + } + + // check if all the dynamic sequences are filled + + SyncSeq[] sms = merger.subjectsMaps; + SyncSeq[] pms = merger.predicatesMaps; + SyncSeq[] oms = merger.objectsMaps; + + AtomicLong[] objectCounts = merger.countObject; + AtomicLong[] subjectCounts = merger.countSubject; + + for (int hdtId = 1; hdtId <= impl.hdts.length; hdtId++) { + HDT hdt = impl.hdts[hdtId - 1]; + SyncSeq sm = sms[hdtId - 1]; + SyncSeq pm = pms[hdtId - 1]; + SyncSeq om = oms[hdtId - 1]; + + AtomicLong objectCount = objectCounts[hdtId - 1]; + AtomicLong subjectCount = subjectCounts[hdtId - 1]; + + long shared = hdt.getDictionary().getShared().getNumberOfElements(); + long subjects = hdt.getDictionary().getSubjects().getNumberOfElements(); + long predicates = hdt.getDictionary().getPredicates().getNumberOfElements(); + long objects = + multi ? 
hdt.getDictionary().getAllObjects().values().stream().mapToLong(DictionarySection::getNumberOfElements).sum() + : hdt.getDictionary().getObjects().getNumberOfElements(); + + assertEquals(shared + objects, objectCount.get()); + assertEquals(shared + subjects, subjectCount.get()); + + for (long i = 1; i <= shared; i++) { + long sv = sm.get(i); + long ov = om.get(i); + if (merger.removeHeader(sv) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing shared subject #" + i + "/" + shared + " for node: " + hdt.getDictionary().idToString(i, TripleComponentRole.SUBJECT)); + } + if (merger.removeHeader(ov) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing shared object #" + i + "/" + shared + " for node: " + hdt.getDictionary().idToString(i, TripleComponentRole.OBJECT)); + } + + assertEquals("shared element not mapped to the same object", ov, sv); + assertTrue("shared mapped element isn't shared", merger.isShared(ov)); + } + + for (long i = 1; i <= subjects; i++) { + if (merger.removeHeader(sm.get(shared + i)) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing subject #" + i + "/" + subjects + " for node: " + hdt.getDictionary().idToString(i + shared, TripleComponentRole.SUBJECT)); + } + } + + for (long i = 1; i <= objects; i++) { + if (merger.removeHeader(om.get(shared + i)) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing object #" + i + "/" + subjects + " for node: " + hdt.getDictionary().idToString(i + shared, TripleComponentRole.OBJECT)); + } + } + + for (long i = 1; i <= predicates; i++) { + if (pm.get(i) == 0) { + fail("HDT #" + hdtId + "/" + impl.hdts.length + " Missing predicate #" + i + "/" + subjects + " for node: " + hdt.getDictionary().idToString(i, TripleComponentRole.PREDICATE)); + } + } + + } + } + } + try (InputStream stream = new BufferedInputStream(Files.newInputStream(dictFile))) { + // read the sections + try (DictionarySection sh = loadSection(stream); + DictionarySection su = loadSection(stream); + DictionarySection pr = loadSection(stream)) { + Map dictionarySectionMap; + DictionarySection ob; + if (multi) { + ob = null; + dictionarySectionMap = loadMultiSection(sub, stream); + } else { + dictionarySectionMap = Map.of(); + ob = loadSection(stream); + } + try { + // map the excepted hdt + try (HDT exceptedHDT = HDTManager.mapHDT(fatcathdt.toAbsolutePath().toString())) { + Dictionary exceptedDict = exceptedHDT.getDictionary(); + assertNotEquals("Invalid test, shared section empty", 0, exceptedHDT.getDictionary().getShared().getNumberOfElements()); + // assert equals between the dictionaries + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Shared", exceptedDict.getShared(), sh); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Subjects", exceptedDict.getSubjects(), su); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Predicates", exceptedDict.getPredicates(), pr); + if (multi) { + Map exceptedDictSub = exceptedDict.getAllObjects(); + dictionarySectionMap.forEach((key, sec) -> { + DictionarySection subSec = exceptedDictSub.get(key); + assertNotNull("sub#" + key + " wasn't found", subSec); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Section#" + key, subSec, sec); + }); + } else { + assert ob != null; + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT("Objects", exceptedDict.getObjects(), ob); + } + } + } finally { + Closer + .of(ob) + .with(dictionarySectionMap.values()) + .close(); + } + } + } + } finally { + PathUtils.deleteDirectory(root); + } + } + + @Test + public void catTest() throws 
ParserException, IOException, NotFoundException { + Path root = tempDir.getRoot().toPath(); + try { + // number of triples per HDT + int countPerHDT = 1000; + Random rnd = new Random(58); + + // create the config + HDTOptions spec = new HDTOptionsBase(); + if (multi) { + spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, HDTOptionsKeys.DICTIONARY_TYPE_VALUE_MULTI_OBJECTS); + spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, HDTOptionsKeys.TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH); + } + + if (map) { + spec.set(HDTOptionsKeys.HDTCAT_FUTURE_LOCATION, root.resolve("futurehc.hdt").toAbsolutePath()); + } + + // create "kcat" fake HDTs + LargeFakeDataSetStreamSupplier s = LargeFakeDataSetStreamSupplier + .createInfinite(42) + .withMaxElementSplit(50) + .withUnicode(unicode); + + long size = 0; + List hdts = new ArrayList<>(); + for (int i = 0; i < kcat; i++) { + String location = root.resolve("hdt" + i + ".hdt").toAbsolutePath().toString(); + hdts.add(location); + int hdtSize = countPerHDT / 2 + rnd.nextInt(countPerHDT); + size += hdtSize; + s.withMaxTriples(hdtSize) + .createAndSaveFakeHDT(spec, location); + } + + // create the excepted HDT from previous algorithm + Path fatcathdt = root.resolve("fatcat.hdt"); + s.reset(); + s.withMaxTriples(size) + .createAndSaveFakeHDT(spec, fatcathdt.toAbsolutePath().toString()); + + // create dictionary and write sections + // map the excepted hdt + try (HDT actualHDT = HDTManager.catHDT(hdts, spec, null)) { + try (HDT exceptedHDT = HDTManager.mapHDT(fatcathdt.toAbsolutePath().toString())) { + // assert equals between the dictionaries + assertNotEquals(0, actualHDT.getDictionary().getShared().getNumberOfElements()); + HDTManagerTest.HDTManagerTestBase.assertEqualsHDT(exceptedHDT, actualHDT); + } + } + } finally { + PathUtils.deleteDirectory(root); + } + } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java index 614d77fc..d483444c 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java @@ -15,11 +15,9 @@ import org.rdfhdt.hdt.dictionary.Dictionary; import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.impl.MultipleBaseDictionary; -import org.rdfhdt.hdt.dictionary.impl.kcat.LocatedIndexedNode; import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.NotFoundException; -import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; @@ -30,7 +28,6 @@ import org.rdfhdt.hdt.rdf.RDFFluxStop; import org.rdfhdt.hdt.rdf.RDFParserFactory; import org.rdfhdt.hdt.triples.IteratorTripleID; -import org.rdfhdt.hdt.triples.IteratorTripleString; import org.rdfhdt.hdt.triples.TripleID; import org.rdfhdt.hdt.triples.TripleString; import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils; @@ -47,7 +44,6 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -572,14 +568,16 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep @RunWith(Parameterized.class) public static class DynamicCatTreeTest extends HDTManagerTestBase { - @Parameterized.Parameters(name = "{5} - {0}") +
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java
index 614d77fc..d483444c 100644
--- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java
@@ -15,11 +15,9 @@
 import org.rdfhdt.hdt.dictionary.Dictionary;
 import org.rdfhdt.hdt.dictionary.DictionarySection;
 import org.rdfhdt.hdt.dictionary.impl.MultipleBaseDictionary;
-import org.rdfhdt.hdt.dictionary.impl.kcat.LocatedIndexedNode;
 import org.rdfhdt.hdt.enums.CompressionType;
 import org.rdfhdt.hdt.enums.RDFNotation;
 import org.rdfhdt.hdt.exceptions.NotFoundException;
-import org.rdfhdt.hdt.exceptions.NotImplementedException;
 import org.rdfhdt.hdt.exceptions.ParserException;
 import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult;
 import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator;
@@ -30,7 +28,6 @@
 import org.rdfhdt.hdt.rdf.RDFFluxStop;
 import org.rdfhdt.hdt.rdf.RDFParserFactory;
 import org.rdfhdt.hdt.triples.IteratorTripleID;
-import org.rdfhdt.hdt.triples.IteratorTripleString;
 import org.rdfhdt.hdt.triples.TripleID;
 import org.rdfhdt.hdt.triples.TripleString;
 import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils;
@@ -47,7 +44,6 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
@@ -572,14 +568,16 @@ public void catTreeDiskTest() throws IOException, ParserException, NotFoundExcep
 
 	@RunWith(Parameterized.class)
 	public static class DynamicCatTreeTest extends HDTManagerTestBase {
-		@Parameterized.Parameters(name = "{5} - {0}")
+		@Parameterized.Parameters(name = "{5} - {0} kcat: {8}")
 		public static Collection<Object[]> params() {
 			List<Object[]> params = new ArrayList<>();
 			for (String[] dict : diskDict()) {
-				params.add(new Object[]{"base", SIZE_VALUE * 16, 20, 50, false, dict[0], dict[1], SIZE_VALUE});
-				params.add(new Object[]{"duplicates", SIZE_VALUE * 16, 10, 50, false, dict[0], dict[1], SIZE_VALUE});
-				params.add(new Object[]{"large-literals", SIZE_VALUE * 4, 20, 250, false, dict[0], dict[1], SIZE_VALUE});
-				params.add(new Object[]{"quiet", SIZE_VALUE * 16, 10, 50, false, dict[0], dict[1], SIZE_VALUE});
+				for (long kcat : new long[]{2, 10, 0}) {
+					params.add(new Object[]{"base", SIZE_VALUE * 16, 20, 50, false, dict[0], dict[1], SIZE_VALUE, kcat});
+					params.add(new Object[]{"duplicates", SIZE_VALUE * 16, 10, 50, false, dict[0], dict[1], SIZE_VALUE, kcat});
+					params.add(new Object[]{"large-literals", SIZE_VALUE * 4, 20, 250, false, dict[0], dict[1], SIZE_VALUE, kcat});
+					params.add(new Object[]{"quiet", SIZE_VALUE * 16, 10, 50, false, dict[0], dict[1], SIZE_VALUE, kcat});
+				}
 			}
 			return params;
 		}
@@ -600,11 +598,19 @@ public static Collection<Object[]> params() {
 		public String tempDictionaryType;
 		@Parameterized.Parameter(7)
 		public long size;
+		@Parameterized.Parameter(8)
+		public long kCat;
+
 		@Before
 		public void setupSpecs() {
 			spec.set(HDTOptionsKeys.DICTIONARY_TYPE_KEY, dictionaryType);
 			spec.set(HDTOptionsKeys.TEMP_DICTIONARY_IMPL_KEY, tempDictionaryType);
+			spec.set(HDTOptionsKeys.PROFILER_KEY, true);
+
+			if (kCat != 0) {
+				spec.set(HDTOptionsKeys.LOADER_CATTREE_KCAT, kCat);
+			}
 		}
 
 		@Test
@@ -995,11 +1001,6 @@ public void bigDiskTest() throws ParserException, IOException {
 		}
 	}
 
-	@Test
-	public void zqdz() {
-		System.out.println("\255".getBytes(StandardCharsets.UTF_8)[0] & 0xFF);
-	}
-
 	@Test
 	public void bigCatTreeDiskTest() throws ParserException, IOException {
 		HDTOptions spec = new HDTSpecification();
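
The new kCat parameter threads k-HDTCat into the cat-tree tests: 0 leaves the option unset and keeps the previous pairwise cat, while 2 or 10 presumably tell the cat-tree loader to merge that many intermediate HDTs per cat operation. Enabling it outside the tests would look like this sketch (the value 10 is arbitrary):

    HDTOptions spec = new HDTOptionsBase();
    // merge up to 10 intermediate HDTs per cat instead of catting them two by two
    spec.set(HDTOptionsKeys.LOADER_CATTREE_KCAT, 10);
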
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java
index b67c1ff4..2fda34bd 100644
--- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java
@@ -106,15 +106,24 @@ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxTriples(long m
 		return new LargeFakeDataSetStreamSupplier(Long.MAX_VALUE, maxTriples, seed);
 	}
 
+	/**
+	 * create a supplier without a max count
+	 *
+	 * @param seed the seed of the supplier, the same seed will create the same supplier
+	 * @return supplier
+	 */
+	public static LargeFakeDataSetStreamSupplier createInfinite(long seed) {
+		return new LargeFakeDataSetStreamSupplier(Long.MAX_VALUE, Long.MAX_VALUE, seed);
+	}
+
 	private final long seed;
 	private Random random;
-	private final long maxSize;
-	private final long maxTriples;
+	private long maxSize;
+	private long maxTriples;
 	public int maxFakeType = 10;
 	public int maxLiteralSize = 2;
 	public int maxElementSplit = Integer.MAX_VALUE;
 	private long slowStream;
-	private boolean sameTripleString;
 	private boolean unicode;
 	private TripleString buffer;
 	private TripleString next;
@@ -212,9 +221,10 @@ public ThreadedStream createNTInputStream(CompressionType compressionType) throw
 			out = pout;
 		}
 
+		Iterator<TripleString> it = createTripleStringStream();
+
 		ExceptionThread run = new ExceptionThread(() -> {
 			try (PrintStream ps = new PrintStream(out, true)) {
-				Iterator<TripleString> it = createTripleStringStream();
 				while (it.hasNext()) {
 					it.next().dumpNtriple(ps);
 				}
@@ -328,7 +338,12 @@ private class FakeStatementIterator implements Iterator<TripleString> {
 		private long count = 0;
 		private boolean init;
 
+		private final long maxTriples;
+		private final long maxSize;
+
 		FakeStatementIterator() {
+			this.maxSize = LargeFakeDataSetStreamSupplier.this.maxSize;
+			this.maxTriples = LargeFakeDataSetStreamSupplier.this.maxTriples;
 		}
 
 		@Override
@@ -393,6 +408,26 @@ public TripleString next() {
 		}
 	}
 
+	/**
+	 * set the max size
+	 * @param maxSize max size
+	 * @return this
+	 */
+	public LargeFakeDataSetStreamSupplier withMaxSize(long maxSize) {
+		this.maxSize = maxSize;
+		return this;
+	}
+
+	/**
+	 * set the max triples count
+	 * @param maxTriples max triples count
+	 * @return this
+	 */
+	public LargeFakeDataSetStreamSupplier withMaxTriples(long maxTriples) {
+		this.maxTriples = maxTriples;
+		return this;
+	}
+
 	/**
 	 * set the maximum number of fake type
 	 *
@@ -456,7 +491,6 @@ public LargeFakeDataSetStreamSupplier withSlowStream(long slowStream) {
 	 * @return this
 	 */
 	public LargeFakeDataSetStreamSupplier withSameTripleString(boolean sameTripleString) {
-		this.sameTripleString = sameTripleString;
 		if (sameTripleString) {
 			buffer = new TripleString();
 		} else {
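
Since FakeStatementIterator now snapshots maxTriples and maxSize when it is created, a supplier can be re-sliced with withMaxTriples without disturbing an iterator that is already running, and consecutive slices continue where the previous one stopped. That is the property the sameTest added below verifies; in isolation (the slice sizes are arbitrary):

    LargeFakeDataSetStreamSupplier s = LargeFakeDataSetStreamSupplier.createInfinite(42);

    Iterator<TripleString> first = s.withMaxTriples(50).createTripleStringStream();
    first.forEachRemaining(t -> {});   // consume the first 50 triples

    // this stream resumes after the 50th triple of the same infinite sequence
    Iterator<TripleString> second = s.withMaxTriples(25).createTripleStringStream();
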
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java
index 7df76445..9f3dcf3f 100644
--- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplierTest.java
@@ -9,6 +9,7 @@
 import org.rdfhdt.hdt.hdt.HDT;
 import org.rdfhdt.hdt.hdt.HDTManager;
 import org.rdfhdt.hdt.hdt.HDTManagerTest;
+import org.rdfhdt.hdt.iterator.utils.CombinedIterator;
 import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator;
 import org.rdfhdt.hdt.options.HDTSpecification;
 import org.rdfhdt.hdt.rdf.RDFParserCallback;
@@ -22,7 +23,10 @@
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.Arrays;
 import java.util.Iterator;
+import java.util.stream.Collectors;
+import java.util.stream.LongStream;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@@ -58,6 +62,26 @@ public void streamTest() throws IOException {
 		}
 	}
 
+	@Test
+	public void sameTest() {
+		long[] sizes = {50, 25, 32, 10, 0, 12};
+		LargeFakeDataSetStreamSupplier s1 = LargeFakeDataSetStreamSupplier.createInfinite(34);
+		LargeFakeDataSetStreamSupplier s2 = LargeFakeDataSetStreamSupplier.createInfinite(34);
+
+		Iterator<TripleString> it1 = CombinedIterator.combine(
+				LongStream.of(sizes)
+						.mapToObj(s -> s1.withMaxTriples(s).createTripleStringStream())
+						.collect(Collectors.toList())
+		);
+
+		Iterator<TripleString> it2 = s2.withMaxTriples(LongStream.of(sizes).sum()).createTripleStringStream();
+
+		while (it2.hasNext()) {
+			assertTrue(it1.hasNext());
+			assertEquals(it2.next(), it1.next());
+		}
+		assertFalse(it1.hasNext());
+	}
 	@Test
 	public void countTest() {
 		long size = 42;
@@ -70,6 +94,81 @@ public void countTest() {
 		}
 		assertEquals(size, count);
 	}
+	@Test
+	public void countTest2() {
+		long size = 42;
+		LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(size, 34);
+		{
+			Iterator<TripleString> it = supplier
+					.createTripleStringStream();
+			int count = 0;
+			while (it.hasNext()) {
+				it.next();
+				count++;
+			}
+			assertEquals(size, count);
+		}
+		{
+			Iterator<TripleString> it = supplier
+					.createTripleStringStream();
+			int count = 0;
+			while (it.hasNext()) {
+				it.next();
+				count++;
+			}
+			assertEquals(size, count);
+		}
+		{
+			Iterator<TripleString> it = supplier
+					.createTripleStringStream();
+			int count = 0;
+			while (it.hasNext()) {
+				it.next();
+				count++;
+			}
+			assertEquals(size, count);
+		}
+	}
+	@Test
+	public void countTest3() {
+		LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier.createInfinite(34);
+		{
+			long size = 42;
+			Iterator<TripleString> it = supplier
+					.withMaxTriples(size)
+					.createTripleStringStream();
+			int count = 0;
+			while (it.hasNext()) {
+				it.next();
+				count++;
+			}
+			assertEquals(size, count);
+		}
+		{
+			long size = 24;
+			Iterator<TripleString> it = supplier
+					.withMaxTriples(size)
+					.createTripleStringStream();
+			int count = 0;
+			while (it.hasNext()) {
+				it.next();
+				count++;
+			}
+			assertEquals(size, count);
+		}
+		{
+			long size = 35;
+			Iterator<TripleString> it = supplier
+					.withMaxTriples(size)
+					.createTripleStringStream();
+			int count = 0;
+			while (it.hasNext()) {
+				it.next();
+				count++;
+			}
+			assertEquals(size, count);
+		}
+	}
 
 	@Test
 	public void mergeTest() throws IOException, ParserException, NotFoundException {
diff --git a/hdt-java-package/bin/hdtCat.bat b/hdt-java-package/bin/hdtCat.bat
new file mode 100644
index 00000000..b2b8d90b
--- /dev/null
+++ b/hdt-java-package/bin/hdtCat.bat
@@ -0,0 +1,5 @@
+@echo off
+
+call "%~dp0\javaenv.bat"
+
+"%JAVACMD%" %JAVAOPTIONS% -classpath %~dp0\..\lib\* org.rdfhdt.hdt.tools.HDTCat %*
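
The new Windows wrapper gives hdtCat the same entry point as the existing shell script: every argument is passed through unchanged to org.rdfhdt.hdt.tools.HDTCat. Assuming the tool keeps its historical inputs-then-output argument order, a k-way cat from the command line would look like:

    hdtCat.bat part1.hdt part2.hdt part3.hdt merged.hdt
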