HDTCatTree + HDTGenDisk #179

Merged
merged 9 commits into from
Nov 21, 2022
add unicode test and key print
ate47 committed Nov 8, 2022
commit 73f7d2b23d71774f6d0e381d81b41cbef84f08de
126 changes: 124 additions & 2 deletions hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java
@@ -3,8 +3,17 @@
import org.rdfhdt.hdt.hdt.HDTVocabulary;
import org.rdfhdt.hdt.rdf.RDFFluxStop;

import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
* keys usable with {@link org.rdfhdt.hdt.options.HDTOptions#set(String, String)}
*
* @author Antoine Willerval
*/
public class HDTOptionsKeys {
@@ -13,58 +13,69 @@ public class HDTOptionsKeys {
* Value can be {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE} or
* {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL}
*/
@Key(type = Key.Type.ENUM, desc = "Compression mode")
public static final String LOADER_DISK_COMPRESSION_MODE_KEY = "loader.disk.compressMode";
/**
* Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}: sort all the files before going to the next step. Slower,
* but decreases RAM usage. Default config.
*/
@Value(value = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort all the files before going to the next step, slower but decreases RAM usage. default config")
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE = "compressionComplete";
/**
* Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}: sort while reading the file, before going to the next
* step. Faster, but increases RAM usage.
*/
@Value(value = LOADER_DISK_COMPRESSION_MODE_KEY, desc = "sort while reading the file before going to the next step, faster but increases RAM usage.")
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial";

/**
* Key for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
* sets the number of workers used to merge the data. Defaults to the number of processors. Long value.
*/
@Key(type = Key.Type.NUMBER, desc = "Number of cores used to compress the HDT")
public static final String LOADER_DISK_COMPRESSION_WORKER_KEY = "loader.disk.compressWorker";
/**
* Key for the maximum size of a chunk on disk for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk
* methods. The chunk is kept in RAM before being written to disk, and should be sorted. Long value.
*/
@Key(type = Key.Type.NUMBER, desc = "Maximum size of a chunk")
public static final String LOADER_DISK_CHUNK_SIZE_KEY = "loader.disk.chunkSize";
/**
* Key for the location of the working directory {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
* this directory will be deleted after the HDT generation. By default the value is random; it is recommended to
* set this option so the directory can be deleted if the process is interrupted. File value.
*/
@Key(type = Key.Type.PATH, desc = "Location of the disk generation directory")
public static final String LOADER_DISK_LOCATION_KEY = "loader.disk.location";
/**
* Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
* this option will create an HDT file after the HDT generation; the returned HDT will be a mapped HDT of that
* file. Slower and increases disk usage, but drastically reduces RAM usage. File value.
*/
@Key(type = Key.Type.PATH, desc = "Location of the future HDT")
public static final String LOADER_DISK_FUTURE_HDT_LOCATION_KEY = "loader.disk.futureHDTLocation";
/**
* Key for the maximum number of files opened at the same time, should be greater than {@link #LOADER_DISK_KWAY_KEY},
* 1024 by default
*/
@Key(type = Key.Type.NUMBER, desc = "Maximum number of files HDTDisk can open at the same time")
public static final String LOADER_DISK_MAX_FILE_OPEN_KEY = "loader.disk.maxFileOpen";
/**
* Key for the number of chunk layers opened at the same time, by default
* <p>min(log2(maxFileOpen), chunkSize / (fileBufferSize * compressWorker))</p>
*/
@Key(type = Key.Type.NUMBER, desc = "log of the number of way the system can merge in genDisk")
public static final String LOADER_DISK_KWAY_KEY = "loader.disk.kway";
/**
* Key for the size of the buffers when opening a file
*/
@Key(type = Key.Type.NUMBER, desc = "Size of the file buffers")
public static final String LOADER_DISK_BUFFER_SIZE_KEY = "loader.disk.fileBufferSize";
/**
* Key for {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDTDisk(java.util.Iterator, String, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)},
* specifies that the method doesn't have to copy the triple strings between two calls to the iterator. Default false.
*/
@Key(type = Key.Type.BOOLEAN, desc = "specifies that the method doesn't have to copy the triple strings between two calls to the iterator")
public static final String LOADER_DISK_NO_COPY_ITERATOR_KEY = "loader.disk.noCopyIterator";

/**
@@ -73,61 +93,74 @@ public class HDTOptionsKeys {
* method; this key doesn't work with the other methods.
* Value can be {@link #LOADER_TYPE_VALUE_ONE_PASS}, {@link #LOADER_TYPE_VALUE_TWO_PASS}, {@link #LOADER_TYPE_VALUE_CAT} or {@link #LOADER_TYPE_VALUE_DISK}.
*/
@Key(type = Key.Type.ENUM, desc = "HDT generation loader type")
public static final String LOADER_TYPE_KEY = "loader.type";
/**
* Value for {@link #LOADER_TYPE_KEY}: read using disk generation; reduces RAM usage but increases disk usage
*/
@Value(value = LOADER_TYPE_KEY, desc = "Using genDisk")
public static final String LOADER_TYPE_VALUE_DISK = "disk";
/**
* Value for {@link #LOADER_TYPE_KEY}: read using HDTCat generation, merging with HDTCat; reduces RAM usage
*/
@Value(value = LOADER_TYPE_KEY, desc = "Using HDTCat")
public static final String LOADER_TYPE_VALUE_CAT = "cat";
/**
* Value for {@link #LOADER_TYPE_KEY}: read the RDF file twice; reduces RAM usage
*/
@Value(value = LOADER_TYPE_KEY, desc = "Using two pass algorithm")
public static final String LOADER_TYPE_VALUE_TWO_PASS = "two-pass";
/**
* Value for {@link #LOADER_TYPE_KEY}: read the RDF file only once. Default value
*/
@Value(value = LOADER_TYPE_KEY, desc = "Using one pass algorithm")
public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass";

/**
* Key for the location of the working directory {@link org.rdfhdt.hdt.hdt.HDTManager} catTree methods,
* this directory will be deleted after the HDT generation. By default the value is random; it is recommended to
* set this option so the directory can be deleted if the process is interrupted. File value.
*/
@Key(type = Key.Type.PATH, desc = "Path of the CatTree generation")
public static final String LOADER_CATTREE_LOCATION_KEY = "loader.cattree.location";
/**
* Same as {@link #LOADER_TYPE_KEY} for the loader in the CATTREE method
*/
@Key(desc = "Loader of the hdt generation")
public static final String LOADER_CATTREE_LOADERTYPE_KEY = "loader.cattree.loadertype";
/**
* Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} catTree methods,
* this option will create an HDT file after the HDT generation; the returned HDT will be a mapped HDT of that
* file. Slower and increases disk usage, but drastically reduces RAM usage. File value.
*/
@Key(type = Key.Type.PATH, desc = "Location of the future HDT")
public static final String LOADER_CATTREE_FUTURE_HDT_LOCATION_KEY = "loader.cattree.futureHDTLocation";
/**
* Key for the fault factor for the {@link org.rdfhdt.hdt.hdt.HDTManager} catTree methods; sets the default
* split size of the RDFFluxStop in the generateHDT method.
*/
@Key(type = Key.Type.DOUBLE, desc = "Memory fault factor for HDTCat tree method split")
public static final String LOADER_CATTREE_MEMORY_FAULT_FACTOR = "loader.cattree.memoryFaultFactor";

/**
* Key for the HDT supplier type. Defaults to memory
*/
@Key(type = Key.Type.ENUM, desc = "HDTCat supplier type")
public static final String HDT_SUPPLIER_KEY = "supplier.type";
/**
* Value for {@link #HDT_SUPPLIER_KEY}, use HDTGenDisk to create the HDT
*/
@Value(value = HDT_SUPPLIER_KEY, desc = "using genDisk")
public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_DISK = "disk";
/**
* Value for {@link #HDT_SUPPLIER_KEY}, use the default memory implementation to create the HDT
*/
@Value(value = HDT_SUPPLIER_KEY, desc = "using gen in memory")
public static final String LOADER_CATTREE_HDT_SUPPLIER_VALUE_MEMORY = "memory";
/**
* Key for the RDF flux stop type. Defaults to the maximum allocated memory
*/
@Key(desc = "API use")
public static final String RDF_FLUX_STOP_KEY = "rdffluxstop.type";
/**
* Value type for the {@link #RDF_FLUX_STOP_KEY}, using {@link RDFFluxStop#asConfig()} would be easier
@@ -157,71 +190,160 @@ public class HDTOptionsKeys {
/**
* Key for enabling the profiler (if implemented). Defaults to false. Boolean value
*/
@Key(type = Key.Type.BOOLEAN, desc = "Use the profiler to get the time of each section")
public static final String PROFILER_KEY = "profiler";
/**
* Key for the profiler output (if implemented). File value
*/
@Key(type = Key.Type.PATH, desc = "Profiler output file")
public static final String PROFILER_OUTPUT_KEY = "profiler.output";
/**
* Key for enabling the simple parser for canonical NTriples files. Defaults to false. Boolean value
*/
@Key(type = Key.Type.BOOLEAN, desc = "Use the canonical NT file parser, removing checks")
public static final String NT_SIMPLE_PARSER_KEY = "parser.ntSimpleParser";
/**
* Key for setting the triple order. See {@link org.rdfhdt.hdt.enums.TripleComponentOrder}'s names for the values.
* Defaults to {@link org.rdfhdt.hdt.enums.TripleComponentOrder#SPO}
*/
@Key(type = Key.Type.STRING, desc = "HDT generation triple order")
public static final String TRIPLE_ORDER_KEY = "triplesOrder";

/**
* Option to set how the HDTs are loaded in HDTCat/HDTDiff, default {@link #LOAD_HDT_TYPE_VALUE_MAP}
*/
@Key(type = Key.Type.ENUM, desc = "loading type for HDTCat / HDTDiff")
public static final String LOAD_HDT_TYPE_KEY = "loader.hdt.type";
/**
* load the HDT file into memory
*/
@Value(value = LOAD_HDT_TYPE_KEY, desc = "load the HDTs in memory")
public static final String LOAD_HDT_TYPE_VALUE_LOAD = "load";
/**
* map the HDT file, default value
*/
@Value(value = LOAD_HDT_TYPE_KEY, desc = "map the HDTs")
public static final String LOAD_HDT_TYPE_VALUE_MAP = "map";

/**
* Implementation of the temporary dictionary
*/
@Key(type = Key.Type.ENUM, desc = "Internal temporary dictionary")
public static final String TEMP_DICTIONARY_IMPL_KEY = "tempDictionary.impl";
/**
* use a hash map to create the HDT
*/
@Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary")
public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH = "hash";
/**
* use a hash map to create the HDT and store the multi-section dictionary; mandatory to create an MSC
*/
@Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "hash dictionary with literal count")
public static final String TEMP_DICTIONARY_IMPL_VALUE_MULT_HASH = "multHash";
/**
* use a hash map with Prefix AND Suffix Front-Coding (PSFC); mandatory to create a PSFC dictionary
*/
@Value(value = TEMP_DICTIONARY_IMPL_KEY, desc = "Prefix AND Suffix front-coded (PSFC) hash dictionary")
public static final String TEMP_DICTIONARY_IMPL_VALUE_HASH_PSFC = "hashPsfc";

/**
* Implementation of the dictionary
*/
@Key(type = Key.Type.ENUM, desc = "HDT dictionary type")
public static final String DICTIONARY_TYPE_KEY = "dictionary.type";
/**
* 4 Section dictionary
*/
@Value(value = DICTIONARY_TYPE_KEY, desc = "Four sectiob dictionary")
public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION;
/**
* Prefix AND Suffix front-coded (PSFC) 4 Section dictionary
*/
@Value(value = DICTIONARY_TYPE_KEY, desc = "Prefix AND Suffix front-coded (PSFC) four section dictionary")
public static final String DICTIONARY_TYPE_VALUE_FOUR_PSFC_SECTION = HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION;
/**
* big 4 Section dictionary
*/
@Value(value = DICTIONARY_TYPE_KEY, desc = "Four section dictionary big")
public static final String DICTIONARY_TYPE_VALUE_FOUR_SECTION_BIG = "dictionaryFourBig";
/**
* multi section dictionary
*/
@Value(value = DICTIONARY_TYPE_KEY, desc = "Multi section dictionary")
public static final String DICTIONARY_TYPE_VALUE_MULTI_OBJECTS = "dictionaryMultiObj";

// use a TreeMap to keep the options sorted by key
private static final Map<String, Option> OPTION_MAP = new TreeMap<>();

static {
try {
for (Field f : HDTOptionsKeys.class.getDeclaredFields()) {
Key key = f.getAnnotation(Key.class);
if (key != null) {
String keyValue = (String) f.get(null);

OPTION_MAP.put(keyValue, new Option(keyValue, key));
} else {
Value value = f.getAnnotation(Value.class);
if (value != null) {
String valueValue = (String) f.get(null);
Option opt = OPTION_MAP.get(value.value());
if (opt != null) {
opt.values.add(new OptionValue(valueValue, value));
}
}
}
}
} catch (Exception e) {
throw new Error("Can't load option keys", e);
}
}

public static Map<String, Option> getOptionMap() {
return Collections.unmodifiableMap(OPTION_MAP);
}

private HDTOptionsKeys() {
}

public static class OptionValue {
private final String value;
private final Value valueInfo;

public OptionValue(String value, Value valueInfo) {
this.value = value;
this.valueInfo = valueInfo;
}

public String getValue() {
return value;
}

public Value getValueInfo() {
return valueInfo;
}
}

public static class Option {
private final String key;
private final Key keyInfo;
private final List<OptionValue> values = new ArrayList<>();

public Option(String key, Key keyInfo) {
this.key = key;
this.keyInfo = keyInfo;
}

public String getKey() {
return key;
}

public Key getKeyInfo() {
return keyInfo;
}

public List<OptionValue> getValues() {
return Collections.unmodifiableList(values);
}
}
}
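Taken together, these constants are still plain string keys consumed through HDTOptions#set (see the class javadoc above). A minimal hedged sketch of configuring a disk-based generation with them — HDTSpecification as the concrete HDTOptions implementation, and the working-directory path is a placeholder:

```java
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.options.HDTOptionsKeys;
import org.rdfhdt.hdt.options.HDTSpecification;

public class GenDiskConfigExample {
    public static void main(String[] args) {
        HDTOptions spec = new HDTSpecification();
        // Generate on disk, sorting each chunk completely before the merge step.
        spec.set(HDTOptionsKeys.LOADER_TYPE_KEY, HDTOptionsKeys.LOADER_TYPE_VALUE_DISK);
        spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY,
                HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE);
        // Placeholder path: set explicitly so the directory can be cleaned up after an interruption.
        spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, "/tmp/hdt-gen-work");
        // 'spec' can then be passed to one of the HDTManager generateHDTDisk methods.
    }
}
```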
25 changes: 25 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/options/Key.java
@@ -0,0 +1,25 @@
package org.rdfhdt.hdt.options;

import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;

@Retention(RetentionPolicy.RUNTIME)
public @interface Key {
enum Type {
STRING("String"), PATH("Path"), NUMBER("Number"), DOUBLE("Double"), BOOLEAN("Boolean"), ENUM("Enum");

private final String title;

Type(String title) {
this.title = title;
}

public String getTitle() {
return title;
}
}

String desc() default "";

Type type() default Type.STRING;
}
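The Type titles above exist so a consumer of HDTOptionsKeys.getOptionMap() can render the registry — the "key print" this commit adds. A hedged sketch of such a dump; the printer class name is invented, not part of this commit:

```java
import java.util.Map;

import org.rdfhdt.hdt.options.HDTOptionsKeys;

// Hypothetical printer, for illustration only.
public class OptionKeyPrinter {
    public static void main(String[] args) {
        for (Map.Entry<String, HDTOptionsKeys.Option> e : HDTOptionsKeys.getOptionMap().entrySet()) {
            HDTOptionsKeys.Option option = e.getValue();
            // One line per key: name, type title and description.
            System.out.println(option.getKey() + " (" + option.getKeyInfo().type().getTitle() + "): "
                    + option.getKeyInfo().desc());
            // One indented line per registered value of this key.
            for (HDTOptionsKeys.OptionValue value : option.getValues()) {
                System.out.println("  - " + value.getValue() + ": " + value.getValueInfo().desc());
            }
        }
    }
}
```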
10 changes: 10 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/options/Value.java
@@ -0,0 +1,10 @@
package org.rdfhdt.hdt.options;

import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;

@Retention(RetentionPolicy.RUNTIME)
public @interface Value {
String value();
String desc() default "";
}
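Key and Value together let an option constant describe itself, and the reflective scanner in HDTOptionsKeys picks up any annotated field declared in that class. A sketch of what a new — purely hypothetical — option would look like inside HDTOptionsKeys, following the file's own conventions:

```java
/**
 * Key for a hypothetical verbosity option — illustration only, not part of this commit.
 */
@Key(type = Key.Type.ENUM, desc = "Verbosity of the generation logging")
public static final String LOADER_VERBOSITY_KEY = "loader.verbosity";
/**
 * Value for {@link #LOADER_VERBOSITY_KEY}, log every step
 */
@Value(value = LOADER_VERBOSITY_KEY, desc = "log every step")
public static final String LOADER_VERBOSITY_VALUE_ALL = "all";
```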
8 changes: 6 additions & 2 deletions hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java
@@ -211,6 +211,7 @@ else if (c == '\\') {
startIdx = backSlashIdx + 2;
}
else if (c == 'u') {
// not canonical NTriples, but accepted anyway
// \\uxxxx
if (backSlashIdx + 5 >= sLength) {
throw new IllegalArgumentException(
@@ -230,6 +231,7 @@ else if (c == 'u') {
}
}
else if (c == 'U') {
// not canonical NTriples, but accepted anyway
// \\Uxxxxxxxx
if (backSlashIdx + 9 >= sLength) {
throw new IllegalArgumentException(
@@ -238,8 +240,10 @@ else if (c == 'U') {
String xx = s.substring(backSlashIdx + 2, backSlashIdx + 10);

try {
c = (char)Integer.parseInt(xx, 16);
sb.append(c);
char[] chars = Character.toChars(Integer.parseInt(xx, 16));
for (char cc : chars) {
sb.append(cc);
}

startIdx = backSlashIdx + 10;
}
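The \U change above is the actual unicode fix: casting the parsed code point to char keeps only its low 16 bits, so any character outside the Basic Multilingual Plane was corrupted, while Character.toChars emits the proper UTF-16 surrogate pair. A self-contained sketch of the difference (the example code point is arbitrary):

```java
public class SurrogatePairDemo {
    public static void main(String[] args) {
        int codePoint = Integer.parseInt("0001F600", 16); // U+1F600, outside the BMP

        // Old behaviour: the narrowing cast keeps only the low 16 bits -> U+F600.
        char truncated = (char) codePoint;
        System.out.println(Integer.toHexString(truncated)); // "f600", wrong

        // New behaviour: Character.toChars yields the surrogate pair D83D DE00.
        StringBuilder sb = new StringBuilder();
        for (char c : Character.toChars(codePoint)) {
            sb.append(c);
        }
        System.out.println(sb.codePointAt(0) == codePoint); // true
    }
}
```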