Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDTCatTree + HDTGenDisk #179

Merged
merged 9 commits into from
Nov 21, 2022
Prev Previous commit
Next Next commit
Remove String usage of GenDisk, ignore Unicode test, add hdtVerify.ba…
…t for Windows, fix string order and better logs for rdf2hdt
  • Loading branch information
ate47 committed Nov 16, 2022
commit d1c474b272e616559b73599e217413813bf53f9f
41 changes: 20 additions & 21 deletions hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
Expand All @@ -28,13 +27,13 @@ public class HDTOptionsKeys {
* Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort all the file before going to the next step, slower
* but decrease the RAM usage. default config.
*/
@Value(value = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort all the file before going to the next step, slower but decrease the RAM usage. default config")
@Value(key = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort all the file before going to the next step, slower but decrease the RAM usage. default config")
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE = "compressionComplete";
/**
* Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort while reading all the file before going to the next
* step, faster but increase the RAM usage.
*/
@Value(value = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort while reading all the file before going to the next step, faster but increase the RAM usage.")
@Value(key = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort while reading all the file before going to the next step, faster but increase the RAM usage.")
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial";

/**
Expand Down Expand Up @@ -98,22 +97,22 @@ public class HDTOptionsKeys {
/**
* Value for {@link #LOADER_TYPE_KEY}, read using disk generation, reduce the RAM usage and increase disk usage
*/
@Value(value = LOADER_TYPE_KEY, desc = "Using genDisk")
@Value(key = LOADER_TYPE_KEY, desc = "Using genDisk")
public static final String LOADER_TYPE_VALUE_DISK = "disk";
/**
* Value for {@link #LOADER_TYPE_KEY}, read using HDTCat generation, merge using HDTCat HDT, reduce the RAM usage
*/
@Value(value = LOADER_TYPE_KEY, desc = "Using HDTCat")
@Value(key = LOADER_TYPE_KEY, desc = "Using HDTCat")
public static final String LOADER_TYPE_VALUE_CAT = "cat";
/**
* Value for {@link #LOADER_TYPE_KEY}, read twice the RDF file, reduce the RAM usage
*/
@Value(value = LOADER_TYPE_KEY, desc = "Using two pass algorithm")
@Value(key = LOADER_TYPE_KEY, desc = "Using two pass algorithm")
public static final String LOADER_TYPE_VALUE_TWO_PASS = "two-pass";
/**
* Value for {@link #LOADER_TYPE_KEY}, read only once the RDF file, default value
*/
@Value(value = LOADER_TYPE_KEY, desc = "Using one pass algorithm")
@Value(key = LOADER_TYPE_KEY, desc = "Using one pass algorithm")
public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass";

/**
Expand Down Expand Up @@ -150,12 +149,12 @@ public class HDTOptionsKeys {
/**
* Value for {@link #HDT_SUPPLIER_KEY}, use HDTGenDisk to create the HDT
*/
@Value(value = HDT_SUPPLIER_KEY, desc = "using genDisk")
@Value(key = HDT_SUPPLIER_KEY, desc = "using genDisk")
public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_DISK = "disk";
/**
* Value for {@link #HDT_SUPPLIER_KEY}, use the default memory implementation to create the HDT
*/
@Value(value = HDT_SUPPLIER_KEY, desc = "using gen in memory")
@Value(key = HDT_SUPPLIER_KEY, desc = "using gen in memory")
public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_MEMORY = "memory";
/**
* Key for the rdf flux stop type, default to the maximum memory allocated
Expand Down Expand Up @@ -217,12 +216,12 @@ public class HDTOptionsKeys {
/**
* load the HDT file into memory
*/
@Value(value = LOAD_HDT_TYPE_KEY, desc = "load the HDTs in memory")
@Value(key = LOAD_HDT_TYPE_KEY, desc = "load the HDTs in memory")
public static final String LOAD_HDT_TYPE_VALUE_LOAD = "load";
/**
* map the HDT file, default value
*/
@Value(value = LOAD_HDT_TYPE_KEY, desc = "map the HDTs")
@Value(key = LOAD_HDT_TYPE_KEY, desc = "map the HDTs")
public static final String LOAD_HDT_TYPE_VALUE_MAP = "map";

/**
Expand All @@ -233,17 +232,17 @@ public class HDTOptionsKeys {
/**
* use Hash map to create the HDT
*/
@Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary")
@Value(key = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary")
public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH = "hash";
/**
* use Hash map to create the HDT and store the multisection dictionary, mandatory to create MSC
*/
@Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary with literal count")
@Value(key = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary with literal count")
public static final String TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH = "multHash";
/**
* use Hash map with Prefix AND Suffix front-coded (PSFC), mandatory to create PSFC dictionary
*/
@Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "Prefix AND Suffix front-coded (PSFC) hash dictionary")
@Value(key = TEMP_DICTIONARY_IMPL_KEY, desc = "Prefix AND Suffix front-coded (PSFC) hash dictionary")
public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH_PSFC = "hashPsfc";

/**
Expand All @@ -254,22 +253,22 @@ public class HDTOptionsKeys {
/**
* 4 Section dictionary
*/
@Value(value = DICTIONARY_TYPE_KEY, desc = "Four section dictionary")
@Value(key = DICTIONARY_TYPE_KEY, desc = "Four section dictionary")
public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION;
/**
* Prefix AND Suffix front-coded (PSFC) 4 Section dictionary
*/
@Value(value = DICTIONARY_TYPE_KEY, desc = "Prefix AND Suffix front-coded (PSFC) four section dictionary")
@Value(key = DICTIONARY_TYPE_KEY, desc = "Prefix AND Suffix front-coded (PSFC) four section dictionary")
public static final String DICTIONARY_TYPE_VALUE_FOUR_PSFC_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION;
/**
* big 4 Section dictionary
*/
@Value(value = DICTIONARY_TYPE_KEY, desc = "Four section dictionary big")
@Value(key = DICTIONARY_TYPE_KEY, desc = "Four section dictionary big")
public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG = "dictionaryFourBig";
/**
* multi section dictionary
*/
@Value(value = DICTIONARY_TYPE_KEY, desc = "Multi section dictionary")
@Value(key = DICTIONARY_TYPE_KEY, desc = "Multi section dictionary")
public static final String DICTIONARY_TYPE_VALUE_MULTI_OBJECTS = "dictionaryMultiObj";

// use tree-map to have a better order
Expand All @@ -287,7 +286,7 @@ public class HDTOptionsKeys {
Value value = f.getAnnotation(Value.class);
if (value != null) {
String valueValue = (String) f.get(null);
Option opt = OPTION_MAP.get(value.value());
Option opt = OPTION_MAP.get(value.key());
if (opt != null) {
opt.values.add(new OptionValue(valueValue, value));
}
Expand All @@ -310,7 +309,7 @@ public static class OptionValue {
private final String value;
private final Value valueInfo;

public OptionValue(String value, Value valueInfo) {
private OptionValue(String value, Value valueInfo) {
this.value = value;
this.valueInfo = valueInfo;
}
Expand All @@ -329,7 +328,7 @@ public static class Option {
private final Key keyInfo;
private final List<OptionValue> values = new ArrayList<>();

public Option(String key, Key keyInfo) {
private Option(String key, Key keyInfo) {
this.key = key;
this.keyInfo = keyInfo;
}
Expand Down
38 changes: 26 additions & 12 deletions hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,37 @@
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;

/**
* define a key in the HDTOptionsKey class
*
* @author Antoine Willerval
*/
@Retention(RetentionPolicy.RUNTIME)
public @interface Key {
enum Type {
STRING("String"), PATH("Path"), NUMBER("Number"), DOUBLE("Double"), BOOLEAN("Boolean"), ENUM("Enum");
/**
* Type enum for a key
*/
enum Type {
STRING("String"), PATH("Path"), NUMBER("Number"), DOUBLE("Double"), BOOLEAN("Boolean"), ENUM("Enum");

private final String title;
private final String title;

Type(String title) {
this.title = title;
}
Type(String title) {
this.title = title;
}

public String getTitle() {
return title;
}
}
public String getTitle() {
return title;
}
}

String desc() default "";
/**
* @return description of the key
*/
String desc() default "";

Type type() default Type.STRING;
/**
* @return type of the key
*/
Type type() default Type.STRING;
}
16 changes: 14 additions & 2 deletions hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,20 @@
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;

/**
* Describe the value of a {@link Key} of type {@link Key.Type#ENUM}
*
* @author Antoine Willerval
*/
@Retention(RetentionPolicy.RUNTIME)
public @interface Value {
String value();
String desc() default "";
/**
* @return the key
*/
String key();

/**
* @return description of the value
*/
String desc() default "";
}
126 changes: 66 additions & 60 deletions hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java
Original file line number Diff line number Diff line change
@@ -1,79 +1,85 @@
package org.rdfhdt.hdt.tools;

import java.util.Comparator;
import java.util.Iterator;

import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.util.string.ByteString;
import org.rdfhdt.hdt.util.string.CharSequenceComparator;
import org.rdfhdt.hdt.util.string.CompactString;
import org.rdfhdt.hdt.util.string.ReplazableString;

import java.util.Comparator;
import java.util.Iterator;

public class HDTVerify {

private static final Comparator<CharSequence> comparator = CharSequenceComparator.getInstance();

private HDTVerify() {}
private HDTVerify() {
}

private static void print(byte[] arr) {
for (byte b : arr) {
System.out.printf("%02X ", b);
}
System.out.println();
}

private static void print(CharSequence seq) {
if (seq instanceof CompactString) {
CompactString cs1 = (CompactString) seq;
print(cs1.getData());
}

private static void print(byte[] arr) {
for (byte b : arr) {
System.out.printf("%02X ", b);
}
System.out.println();
}
if (seq instanceof String) {
String rs1 = (String) seq;
print(rs1.getBytes());
}
}

private static void print(CharSequence seq) {
if(seq instanceof CompactString) {
CompactString cs1 = (CompactString) seq;
print(cs1.getData());
}
public static void checkDictionarySectionOrder(Iterator<? extends CharSequence> it) {
ReplazableString prev = new ReplazableString();
String lastStr = "";
while (it.hasNext()) {
ByteString charSeq = ByteString.of(it.next());
String str = charSeq.toString();

if(seq instanceof String) {
String rs1 = (String) seq;
print(rs1.getBytes());
}
}
int cmp = prev.compareTo(charSeq);

public static void checkDictionarySectionOrder(Iterator<? extends CharSequence> it) {
CharSequence lastCharseq = null;
String lastStr =null;
int cmp=0, cmp2=0;
while (it.hasNext()) {
CharSequence charSeq = it.next();
String str = charSeq.toString();
if (cmp >= 0) {
System.out.println("ERRA: " + prev + " / " + charSeq);
}

if(lastCharseq!=null && ((cmp=comparator.compare(lastCharseq, charSeq))>=0 )) {
System.out.println("ERRA: "+lastCharseq+" / "+charSeq);
}
int cmp2 = lastStr.compareTo(str);

if(lastStr!=null && ((cmp2=lastStr.compareTo(str))>=0)) {
System.out.println("ERRB: "+lastStr+" / "+str);
}
if (cmp2 >= 0) {
System.out.println("ERRB: " + lastStr + " / " + str);
}

if(Math.signum(cmp)!=Math.signum(cmp2)) {
System.out.println("Not equal: "+cmp+" / "+cmp2);
print(lastCharseq); print(charSeq);
print(lastStr); print(str);
}
if (Math.signum(cmp) != Math.signum(cmp2)) {
System.out.println("Not equal: " + cmp + " / " + cmp2);
print(prev);
print(charSeq);
print(lastStr);
print(str);
}

lastCharseq = charSeq;
lastStr = str;
}
}
prev.replace(charSeq);
lastStr = str;
}
}

public static void main(String[] args) throws Throwable {
if(args.length<1) {
System.out.println("hdtVerify <file.hdt>");
System.exit(-1);
}
try (HDT hdt = HDTManager.mapHDT(args[0], null)) {
System.out.println("Checking subject entries");
checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries());
System.out.println("Checking predicate entries");
checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries());
System.out.println("Checking object entries");
checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries());
System.out.println("Checking shared entries");
checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries());
}
}
public static void main(String[] args) throws Throwable {
if (args.length < 1) {
System.out.println("hdtVerify <file.hdt>");
System.exit(-1);
}
try (HDT hdt = HDTManager.mapHDT(args[0], null)) {
System.out.println("Checking subject entries");
checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries());
System.out.println("Checking predicate entries");
checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries());
System.out.println("Checking object entries");
checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries());
System.out.println("Checking shared entries");
checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries());
}
}
}
Loading