From 4e619cfde10530035231ae4a2f6cbcd06403e325 Mon Sep 17 00:00:00 2001 From: Ali Haidar Date: Tue, 11 Jan 2022 12:38:50 +0100 Subject: [PATCH 1/2] New dictionary splitting objects into subsections per literal type --- .../org/rdfhdt/hdt/dictionary/Dictionary.java | 34 +- .../java/org/rdfhdt/hdt/hdt/HDTVersion.java | 17 +- .../org/rdfhdt/hdt/hdt/HDTVocabulary.java | 15 +- .../hdt/dictionary/DictionaryFactory.java | 33 +- .../hdt/dictionary/impl/BaseDictionary.java | 29 ++ .../impl/DictionaryPFCOptimizedExtractor.java | 2 +- .../hdt/dictionary/impl/HashDictionary.java | 12 +- .../MultDictionaryPFCOptimizedExtractor.java | 116 +++++++ .../impl/MultipleBaseDictionary.java | 325 ++++++++++++++++++ .../impl/MultipleSectionDictionary.java | 232 +++++++++++++ .../impl/MultipleSectionDictionaryBig.java | 228 ++++++++++++ .../dictionary/impl/OptimizedExtractor.java | 7 + .../impl/section/HashDictionarySection.java | 64 +++- .../java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java | 18 +- .../DictionaryTranslateIteratorBuffer.java | 21 +- .../org/rdfhdt/hdt/util/CustomIterator.java | 53 +++ .../org/rdfhdt/hdt/util/LiteralsUtils.java | 65 ++++ .../string/CharSequenceCustomComparator.java | 86 +++++ .../hdt/literalsDict/HDTLiteralsDictTest.java | 48 +++ 19 files changed, 1325 insertions(+), 80 deletions(-) create mode 100755 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java create mode 100755 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java create mode 100755 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java create mode 100755 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java create mode 100755 hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/OptimizedExtractor.java create mode 100755 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java create mode 100755 
hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java create mode 100755 hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CharSequenceCustomComparator.java create mode 100644 hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/Dictionary.java b/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/Dictionary.java index efae4744..d6c54e07 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/Dictionary.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/dictionary/Dictionary.java @@ -28,6 +28,8 @@ import java.io.Closeable; +import java.util.HashMap; +import java.util.TreeMap; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.header.Header; @@ -65,45 +67,44 @@ public interface Dictionary extends Closeable { public long stringToId(CharSequence str, TripleComponentRole position); /** - * * Returns the number of elements in the dictionary + * Returns the number of elements in the dictionary + */ + + /** + * Returns the data type of a given literal string * - * @return long + * @param id + * The id to get the data type for + * @return String */ + public String dataTypeOfId(long id); + public long getNumberOfElements(); /** * Return the combined size of the sections of the dictionary (in bytes) - * - * @return long */ public long size(); /** * Returns the number of subjects in the dictionary. Note: Includes shared. - * - * @return long */ public long getNsubjects(); /** * Returns the number of predicates in the dictionary. - * - * @return long */ public long getNpredicates(); /** * Returns the number of objects in the dictionary. Note: Includes shared - * - * @return long */ public long getNobjects(); /** * Returns the number of subjects/objects in the dictionary. 
- * - * @return long */ + public long getNAllObjects(); public long getNshared(); public DictionarySection getSubjects(); @@ -112,21 +113,18 @@ public interface Dictionary extends Closeable { public DictionarySection getObjects(); + public TreeMap getAllObjects(); + public DictionarySection getShared(); /** * Fills the header with information from the dictionary - * @param header - * the header to fill - * @param rootNode - * the rdf root node */ public void populateHeader(Header header, String rootNode); /** * Returns the type of the dictionary (the way it is written onto file/held in memory) - * - * @return String + * @return */ public String getType(); } \ No newline at end of file diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVersion.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVersion.java index 422c55ff..5795c624 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVersion.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVersion.java @@ -6,19 +6,20 @@ public class HDTVersion { // Version of the actual HDT file that is generated or read. // Software must be backwards compatible with all HDT files with the same number. public static final String HDT_VERSION = "1"; - + public static final String HDT_VERSION_2 = "2"; + // Version of the accompagning .index file that is generated or read // Software must be backwards compatible with all index files with the same index and HDT version number. 
public static final String INDEX_VERSION = "1"; // Subreleases that are backwards compatible with both HDT and index file public static final String RELEASE_VERSION ="2"; - - public static String get_version_string(String delimiter) { - return "v" + HDT_VERSION + delimiter + INDEX_VERSION + delimiter + RELEASE_VERSION; - }; - public static String get_index_suffix(String delimiter) { - return ".index.v" + HDT_VERSION + delimiter+INDEX_VERSION; - }; + public static String get_version_string(String delimiter) { + return "v" + HDT_VERSION + delimiter + INDEX_VERSION + delimiter + RELEASE_VERSION; + }; + + public static String get_index_suffix(String delimiter) { + return ".index.v" + HDT_VERSION + delimiter+INDEX_VERSION; + }; } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVocabulary.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVocabulary.java index 6f33eee9..850ef3aa 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVocabulary.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTVocabulary.java @@ -35,7 +35,8 @@ public class HDTVocabulary { // Base public static final String HDT_BASE = ""; - + public static final String HDT_CONTAINER_2 = HDT_BASE+"HDTv" + HDTVersion.HDT_VERSION_2+">"; + public static final String HDT_HEADER = HDT_BASE+"header"; public static final String HDT_DICTIONARY_BASE = HDT_BASE+"dictionary"; public static final String HDT_DICTIONARY = HDT_DICTIONARY_BASE+">"; @@ -49,7 +50,7 @@ public class HDTVocabulary { public static final String RDF_TYPE = RDF+"type>"; public static final String DUBLIN_CORE = ""; - + // VOID public static final String VOID_BASE =""; @@ -82,6 +83,8 @@ public class HDTVocabulary { // Dictionary Types public static final String DICTIONARY_TYPE_PLAIN = HDT_DICTIONARY_BASE+"Plain>"; public static final String DICTIONARY_TYPE_FOUR_SECTION = HDT_DICTIONARY_BASE+"Four>"; + public static final String DICTIONARY_TYPE_MULT_SECTION = HDT_DICTIONARY_BASE+"Mult>"; + public static final String 
DICTIONARY_TYPE_FOUR_PSFC_SECTION = HDT_DICTIONARY_BASE+"FourPsfc>"; // Triples @@ -103,7 +106,7 @@ public class HDTVocabulary { public static final String TRIPLES_TYPE_PLAIN = HDT_TRIPLES_BASE+"Plain>"; public static final String TRIPLES_TYPE_COMPACT = HDT_TRIPLES_BASE+"Compact>"; public static final String TRIPLES_TYPE_BITMAP = HDT_TRIPLES_BASE+"Bitmap>"; - + // Index type public static final String INDEX_TYPE_FOQ = HDT_BASE+"indexFoQ>"; @@ -116,10 +119,10 @@ public class HDTVocabulary { // Bitmaps public static final String BITMAP_TYPE_PLAIN = HDT_BITMAP_BASE+"Plain>"; - - // Misc + + // Misc public static final String ORIGINAL_SIZE = HDT_BASE+"originalSize>"; public static final String HDT_SIZE = HDT_BASE+"hdtSize>"; - + private HDTVocabulary() {} } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java index 69c887b4..b0588c41 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryFactory.java @@ -27,11 +27,7 @@ package org.rdfhdt.hdt.dictionary; -import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary; -import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionaryBig; -import org.rdfhdt.hdt.dictionary.impl.HashDictionary; -import org.rdfhdt.hdt.dictionary.impl.PSFCFourSectionDictionary; -import org.rdfhdt.hdt.dictionary.impl.PSFCTempDictionary; +import org.rdfhdt.hdt.dictionary.impl.*; import org.rdfhdt.hdt.exceptions.IllegalFormatException; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.options.ControlInfo; @@ -40,43 +36,46 @@ /** * Factory that creates Dictionary objects - * + * */ public class DictionaryFactory { public static final String MOD_DICT_IMPL_HASH = "hash"; + public static final String MOD_DICT_IMPL_MULT_HASH = "multHash"; public static final String MOD_DICT_IMPL_HASH_PSFC = "hashPsfc"; public static final String 
DICTIONARY_TYPE_FOUR_SECTION_BIG ="dictionaryFourBig"; - + public static final String DICTIONARY_TYPE_MULTI_OBJECTS = "dictionaryMultiObj"; private DictionaryFactory() {} /** * Creates a default dictionary (HashDictionary) - * + * * @return Dictionary */ public static Dictionary createDefaultDictionary() throws IllegalArgumentException { return new FourSectionDictionary(new HDTSpecification()); } - + /** * Creates a default dictionary (HashDictionary) - * + * * @return Dictionary */ public static TempDictionary createTempDictionary(HDTOptions spec) { String name = spec.get("tempDictionary.impl"); - + // Implementations available in the Core if(name==null || "".equals(name) || MOD_DICT_IMPL_HASH.equals(name)) { - return new HashDictionary(spec); + return new HashDictionary(spec,false); } else if(MOD_DICT_IMPL_HASH_PSFC.equals(name)){ - return new PSFCTempDictionary(new HashDictionary(spec)); + return new PSFCTempDictionary(new HashDictionary(spec,false)); + } else if(MOD_DICT_IMPL_MULT_HASH.equals(name)){ + return new HashDictionary(spec,true); } throw new IllegalFormatException("Implementation of triples not found for "+name); } - + public static DictionaryPrivate createDictionary(HDTOptions spec) { String name = spec.get("dictionary.type"); if(name==null || HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION.equals(name)) { @@ -87,16 +86,20 @@ else if (HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION.equals(name)){ } else if (DICTIONARY_TYPE_FOUR_SECTION_BIG.equals(name)){ return new FourSectionDictionaryBig(spec); + }else if ((DICTIONARY_TYPE_MULTI_OBJECTS.equals(name))){ + return new MultipleSectionDictionary(spec); } throw new IllegalFormatException("Implementation of dictionary not found for "+name); } - + public static DictionaryPrivate createDictionary(ControlInfo ci) { String name = ci.getFormat(); if(HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION.equals(name)) { return new FourSectionDictionary(new HDTSpecification()); } else if 
(HDTVocabulary.DICTIONARY_TYPE_FOUR_PSFC_SECTION.equals(name)) { return new PSFCFourSectionDictionary(new HDTSpecification()); + } else if(HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION.equals(name)){ + return new MultipleSectionDictionary(new HDTSpecification()); } throw new IllegalFormatException("Implementation of dictionary not found for "+name); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java index cccd8903..f005671e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java @@ -36,6 +36,8 @@ import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.DelayedString; +import java.util.TreeMap; + /** * * This abstract class implements all general methods that are the same @@ -218,5 +220,32 @@ public CharSequence idToString(long id, TripleComponentRole role) { long localId = getLocalId(id, role); return section.extract(localId); } + @Override + public String dataTypeOfId(long id) { + try { + throw new IllegalAccessException("Method is not applicable on this dictionary"); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } + return ""; + } + @Override + public TreeMap getAllObjects() { + try { + throw new IllegalAccessException("Method is not applicable on this dictionary"); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } + return null; + } + @Override + public long getNAllObjects() { + try { + throw new IllegalAccessException("Method is not applicable on this dictionary"); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } + return 0; + } } \ No newline at end of file diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/DictionaryPFCOptimizedExtractor.java 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/DictionaryPFCOptimizedExtractor.java index fc924312..8672087a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/DictionaryPFCOptimizedExtractor.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/DictionaryPFCOptimizedExtractor.java @@ -4,7 +4,7 @@ import org.rdfhdt.hdt.dictionary.impl.section.PFCOptimizedExtractor; import org.rdfhdt.hdt.enums.TripleComponentRole; -public class DictionaryPFCOptimizedExtractor { +public class DictionaryPFCOptimizedExtractor implements OptimizedExtractor{ private final PFCOptimizedExtractor shared, subjects, predicates, objects; private final long numshared; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/HashDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/HashDictionary.java index a3a87880..6f41f504 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/HashDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/HashDictionary.java @@ -28,6 +28,7 @@ package org.rdfhdt.hdt.dictionary.impl; import java.io.IOException; +import java.util.HashMap; import java.util.Iterator; import org.rdfhdt.hdt.dictionary.TempDictionarySection; @@ -35,6 +36,7 @@ import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleID; import org.rdfhdt.hdt.util.StopWatch; /** @@ -43,13 +45,14 @@ */ public class HashDictionary extends BaseTempDictionary { - public HashDictionary(HDTOptions spec) { + boolean isCustom = false; + public HashDictionary(HDTOptions spec,boolean isCustom) { super(spec); - + this.isCustom = isCustom; // FIXME: Read types from spec subjects = new HashDictionarySection(); predicates = new HashDictionarySection(); - objects = new HashDictionarySection(); + objects = new HashDictionarySection(isCustom); shared = new HashDictionarySection(); 
} @@ -105,7 +108,10 @@ public void reorganize(TempTriples triples) { st.reset(); subjects.sort(); predicates.sort(); + long startTime = System.currentTimeMillis(); objects.sort(); + long endTime = System.currentTimeMillis(); + //System.out.println("Time to sort temp objects:"+(endTime - startTime)+" ms"); shared.sort(); //System.out.println("Sections sorted in "+ st.stopAndShow()); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java new file mode 100755 index 00000000..efb55009 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultDictionaryPFCOptimizedExtractor.java @@ -0,0 +1,116 @@ +package org.rdfhdt.hdt.dictionary.impl; + +import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySectionMap; +import org.rdfhdt.hdt.dictionary.impl.section.PFCOptimizedExtractor; +import org.rdfhdt.hdt.enums.TripleComponentRole; +import org.rdfhdt.hdt.util.LiteralsUtils; + +import java.util.AbstractMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +public class MultDictionaryPFCOptimizedExtractor implements OptimizedExtractor{ + private final PFCOptimizedExtractor shared, subjects, predicates; + private final TreeMap objects; + private final long numshared; + + public MultDictionaryPFCOptimizedExtractor(MultipleSectionDictionary origDict) { + numshared=(int) origDict.getNshared(); + shared = new PFCOptimizedExtractor((PFCDictionarySectionMap) origDict.shared); + subjects = new PFCOptimizedExtractor((PFCDictionarySectionMap) origDict.subjects); + predicates = new PFCOptimizedExtractor((PFCDictionarySectionMap) origDict.predicates); + objects = new TreeMap<>(); + Iterator iterator = origDict.getAllObjects().entrySet().iterator(); + while (iterator.hasNext()){ + Map.Entry entry = (Map.Entry)iterator.next(); + objects.put((String)entry.getKey(),new 
PFCOptimizedExtractor((PFCDictionarySectionMap)entry.getValue())); + } + } + + @Override + public CharSequence idToString(long id, TripleComponentRole role) { + AbstractMap.SimpleEntry section = getSection(id, role); + long localId = getLocalId(id, role); + if(section.getKey().equals("NO_DATATYPE") || section.getKey().equals("section")) + return section.getValue().extract(localId); + else { + String label = section.getValue().extract(localId).toString(); + String dType = section.getKey(); + //Matcher matcher = pattern.matcher(label); + if(LiteralsUtils.containsLanguage(label)){ + return label; + }else{ + return label + "^^" + dType; + } + } + } + private AbstractMap.SimpleEntry getSection(long id, TripleComponentRole role) { + switch (role) { + case SUBJECT: + if(id<=numshared) { + return new AbstractMap.SimpleEntry<>("section",shared); + } else { + return new AbstractMap.SimpleEntry<>("section",subjects); + } + case PREDICATE: + return new AbstractMap.SimpleEntry<>("section",predicates); + case OBJECT: + if(id<= numshared) { + return new AbstractMap.SimpleEntry<>("section",shared); + } else { + Iterator hmIterator = objects.entrySet().iterator(); + // iterate over all subsections in the objects section + PFCOptimizedExtractor desiredSection = null; + String type = ""; + int count = 0; + while (hmIterator.hasNext()){ + Map.Entry entry = (Map.Entry)hmIterator.next(); + PFCOptimizedExtractor subSection = (PFCOptimizedExtractor)entry.getValue(); + count+= subSection.getNumStrings(); + if(id <= numshared+count){ + desiredSection = subSection; + type = (String)entry.getKey(); + break; + } + } + return new AbstractMap.SimpleEntry<>(type,desiredSection); + } + } + throw new IllegalArgumentException(); + } + + + private long getLocalId(long id, TripleComponentRole position) { + switch (position) { + case SUBJECT: + if(id <= numshared) + return id; + else + return id - numshared; + case OBJECT: + if(id<=numshared) { + return id; + } else { + Iterator hmIterator = 
objects.entrySet().iterator(); + // iterate over all subsections in the objects section + long count = 0; + while (hmIterator.hasNext()){ + Map.Entry entry = (Map.Entry)hmIterator.next(); + PFCOptimizedExtractor subSection = (PFCOptimizedExtractor)entry.getValue(); + count+= subSection.getNumStrings(); + if(id <= numshared+ count){ + count -= subSection.getNumStrings(); + break; + } + } + // subtract the number of elements in the shared + the subsections in the objects section + return id - count - numshared; + } + case PREDICATE: + return id; + } + + throw new IllegalArgumentException(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java new file mode 100755 index 00000000..b1f57a86 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java @@ -0,0 +1,325 @@ +package org.rdfhdt.hdt.dictionary.impl; + + +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.impl.section.PFCOptimizedExtractor; +import org.rdfhdt.hdt.enums.DictionarySectionRole; +import org.rdfhdt.hdt.enums.TripleComponentRole; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.string.CompactString; +import org.rdfhdt.hdt.util.string.DelayedString; + +import java.util.AbstractMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; +import java.util.regex.Pattern; + +public abstract class MultipleBaseDictionary implements DictionaryPrivate { + + protected final HDTOptions spec; + + protected DictionarySectionPrivate subjects; + protected DictionarySectionPrivate predicates; + protected TreeMap objects; + protected DictionarySectionPrivate shared; + + public 
MultipleBaseDictionary(HDTOptions spec) { + this.spec = spec; + } + + protected long getGlobalId(long id, DictionarySectionRole position,CharSequence str) { + switch (position) { + case SUBJECT: + return id + shared.getNumberOfElements(); + case OBJECT: { + Iterator iter = objects.entrySet().iterator(); + int count = 0; + while (iter.hasNext()){ + Map.Entry entry = (Map.Entry)iter.next(); + count+= ((DictionarySectionPrivate)entry.getValue()).getNumberOfElements(); + if(LiteralsUtils.getType(str).equals((String)entry.getKey())){ + count -= ((DictionarySectionPrivate)entry.getValue()).getNumberOfElements(); + break; + } + + } + return shared.getNumberOfElements() + count+id; + } + + + case PREDICATE: + case SHARED: + return id; + default: + throw new IllegalArgumentException(); + } + } + + /* + TODO: Change the objects part to look over the sections according to some pointer + */ + protected long getLocalId(long id, TripleComponentRole position) { + switch (position) { + case SUBJECT: + if(id <= shared.getNumberOfElements()) + return id; + else + return id-shared.getNumberOfElements(); + case OBJECT: + if(id<=shared.getNumberOfElements()) { + return id; + } else { + Iterator hmIterator = objects.entrySet().iterator(); + // iterate over all subsections in the objects section + long count = 0; + while (hmIterator.hasNext()){ + Map.Entry entry = (Map.Entry)hmIterator.next(); + long numElts = 0; + if(entry.getValue() instanceof DictionarySectionPrivate) + numElts = ((DictionarySectionPrivate)entry.getValue()).getNumberOfElements(); + else if(entry.getValue() instanceof PFCOptimizedExtractor) + numElts = ((PFCOptimizedExtractor)entry.getValue()).getNumStrings(); + count+= numElts; + if(id <= shared.getNumberOfElements()+ count){ + count -= numElts; + break; + } + } + // subtract the number of elements in the shared + the subsections in the objects section + return id - count - shared.getNumberOfElements(); + } + case PREDICATE: + return id; + default: + throw new 
IllegalArgumentException(); + } + } + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#stringToId(java.lang.CharSequence, datatypes.TripleComponentRole) + */ + @Override + public long stringToId(CharSequence str, TripleComponentRole position) { + str = DelayedString.unwrap(str); + + if(str==null || str.length()==0) { + return 0; + } + + if(str instanceof String) { + // CompactString is more efficient for the binary search. + str = new CompactString(str); + } + + long ret=0; + switch(position) { + case SUBJECT: + ret = shared.locate(str); + if(ret!=0) { + return getGlobalId(ret, DictionarySectionRole.SHARED,str); + } + ret = subjects.locate(str); + if(ret!=0) { + return getGlobalId(ret, DictionarySectionRole.SUBJECT,str); + } + return -1; + case PREDICATE: + ret = predicates.locate(str); + if(ret!=0) { + return getGlobalId(ret, DictionarySectionRole.PREDICATE,str); + } + return -1; + case OBJECT: + if(str.charAt(0)!='"') { + ret = shared.locate(str); + if(ret!=0) { + return getGlobalId(ret, DictionarySectionRole.SHARED,str); + } + } + DictionarySectionPrivate subSection = getSubSection(str); + if( subSection!= null) + ret = subSection.locate(new CompactString(LiteralsUtils.removeType(str))); + else + return -1; + if(ret!=0) { + return getGlobalId(ret, DictionarySectionRole.OBJECT,str); + } + return -1; + default: + throw new IllegalArgumentException(); + } + } + + private long getNumberObjectsAllSections(){ + Iterator hmIterator = objects.entrySet().iterator(); + // iterate over all subsections in the objects section + long total = 0; + while (hmIterator.hasNext()){ + Map.Entry entry = (Map.Entry)hmIterator.next(); + DictionarySectionPrivate subSection = (DictionarySectionPrivate) entry.getValue(); + total += subSection.getNumberOfElements(); + } + return total; + } + @Override + public long getNumberOfElements() { + + return subjects.getNumberOfElements()+predicates.getNumberOfElements()+getNumberObjectsAllSections()+shared.getNumberOfElements(); + } + + 
@Override + public long size() { + return subjects.size()+predicates.size()+objects.size()+shared.size(); + } + + @Override + public long getNsubjects() { + return subjects.getNumberOfElements()+shared.getNumberOfElements(); + } + + @Override + public long getNpredicates() { + return predicates.getNumberOfElements(); + } + + @Override + public long getNobjects() { + return getNumberObjectsAllSections()+shared.getNumberOfElements(); + } + + @Override + public long getNshared() { + return shared.getNumberOfElements(); + } + + @Override + public DictionarySection getSubjects() { + return subjects; + } + + @Override + public DictionarySection getPredicates() { + return predicates; + } + + @Override + public TreeMap getAllObjects() { + return new TreeMap<>(this.objects); + } + + @Override + public DictionarySection getObjects() { + return null; + } + + @Override + public DictionarySection getShared() { + return shared; + } + + private AbstractMap.SimpleEntry getSection(long id, TripleComponentRole role) { + switch (role) { + case SUBJECT: + if(id<=shared.getNumberOfElements()) { + return new AbstractMap.SimpleEntry<>("section",shared); + } else { + return new AbstractMap.SimpleEntry<>("section",subjects); + } + case PREDICATE: + return new AbstractMap.SimpleEntry<>("section",predicates); + case OBJECT: + if(id<=shared.getNumberOfElements()) { + return new AbstractMap.SimpleEntry<>("section",shared); + } else { + + Iterator hmIterator = objects.entrySet().iterator(); + // iterate over all subsections in the objects section + DictionarySectionPrivate desiredSection = null; + String type = ""; + int count = 0; + while (hmIterator.hasNext()){ + Map.Entry entry = (Map.Entry)hmIterator.next(); + DictionarySectionPrivate subSection = (DictionarySectionPrivate)entry.getValue(); + count += subSection.getNumberOfElements(); + if(id <= shared.getNumberOfElements()+ count){ + desiredSection = subSection; + type = (String)entry.getKey(); + break; + } + } + return new 
AbstractMap.SimpleEntry<>(type,desiredSection); + } + default: + throw new IllegalArgumentException(); + } + } + static Pattern pattern = Pattern.compile("@[a-zA-Z0-9\\-]+$"); + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#idToString(int, datatypes.TripleComponentRole) + */ + @Override + public CharSequence idToString(long id, TripleComponentRole role) { + AbstractMap.SimpleEntry section = getSection(id, role); + long localId = getLocalId(id, role); + if(section.getKey().equals("NO_DATATYPE") || section.getKey().equals("section")) + return section.getValue().extract(localId); + else { + if(section.getValue() == null){ + // this should not happen, means that the given id wasn't found in any section + System.out.println("Error couldn't find the section for the given ID: ["+id+"]"); + return null; + }else { + String label = section.getValue().extract(localId).toString(); + String dType = section.getKey(); + //Matcher matcher = pattern.matcher(label); + if (LiteralsUtils.containsLanguage(label)) { + return label; + } else { + return label + "^^" + dType; + } + } + } + } + private DictionarySectionPrivate getSubSection(CharSequence str){ + String dataType = ""; +// if(str.toString().startsWith("\"")) { +// if(str.toString().matches("\".*\"\\^\\^<.*>")){ +// dataType = str.toString().split("\\^")[2]; +// }else{ +// dataType = "NO_DATATYPE"; +// } +// }else{ +// dataType = "NO_DATATYPE"; +// } + dataType = LiteralsUtils.getType(str); + return objects.get(dataType); + } + @Override + public String dataTypeOfId(long id) { + return getSection(id,TripleComponentRole.OBJECT).getKey(); + } + public AbstractMap.SimpleEntry getDataTypeRange(String dataType){ + if(!dataType.equals("NO_DATATYPE")) + dataType = "<"+dataType+">"; + if(objects.containsKey(dataType)) { // literals subsection exist + Iterator iter = objects.entrySet().iterator(); + int count = 0; + while (iter.hasNext()) { + Map.Entry entry = (Map.Entry) iter.next(); + count += ((DictionarySectionPrivate) 
entry.getValue()).getNumberOfElements(); + if (dataType.equals((String) entry.getKey())) { + count -= ((DictionarySectionPrivate) entry.getValue()).getNumberOfElements(); + break; + } + + } + long offset = shared.getNumberOfElements() + count; + long size = offset + objects.get(dataType).getNumberOfElements(); + return new AbstractMap.SimpleEntry<>(offset +1, size); + } + return new AbstractMap.SimpleEntry<>(0L,0L); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java new file mode 100755 index 00000000..8199aace --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java @@ -0,0 +1,232 @@ +package org.rdfhdt.hdt.dictionary.impl; + +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.TempDictionary; +import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory; +import org.rdfhdt.hdt.dictionary.impl.section.HashDictionarySection; +import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySection; +import org.rdfhdt.hdt.exceptions.IllegalFormatException; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; +import org.rdfhdt.hdt.options.ControlInformation; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.CustomIterator; +import org.rdfhdt.hdt.util.LiteralsUtils; +import org.rdfhdt.hdt.util.io.CountInputStream; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.string.CompactString; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.*; + +public class +MultipleSectionDictionary 
extends MultipleBaseDictionary { + + + public MultipleSectionDictionary(HDTOptions spec) { + super(spec); + // FIXME: Read type from spec. + subjects = new PFCDictionarySection(spec); + predicates = new PFCDictionarySection(spec); + objects = new TreeMap(); + shared = new PFCDictionarySection(spec); + } + + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#load(hdt.dictionary.Dictionary) + */ + @Override + public void load(TempDictionary other, ProgressListener listener) { + IntermediateListener iListener = new IntermediateListener(listener); + subjects.load(other.getSubjects(), iListener); + predicates.load(other.getPredicates(), iListener); + Iterator iter = other.getObjects().getEntries(); + + HashMap literalsCounts = ((HashDictionarySection)other.getObjects()).getLiteralsCounts(); + if(literalsCounts.containsKey("NO_DATATYPE")) + literalsCounts.put("NO_DATATYPE",literalsCounts.get("NO_DATATYPE") - other.getShared().getNumberOfElements()); + CustomIterator customIterator = new CustomIterator(iter,literalsCounts); + long startTime = System.currentTimeMillis(); + while (customIterator.hasNext()){ + PFCDictionarySection section = new PFCDictionarySection(spec); + String type = LiteralsUtils.getType(customIterator.prev); + long numEntries = literalsCounts.get(type); + + section.load(customIterator,numEntries,listener); + long locate = section.locate(new CompactString("\"\uD83C\uDDEB\uD83C\uDDF7\"@ro")); + objects.put(type,section); + } + long endTime = System.currentTimeMillis(); + //System.out.println("Loaded objects subsections in: "+(endTime - startTime)+" ms"); + shared.load(other.getShared(), iListener); + } + + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener) + */ + @Override + public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { + ci.setType(ControlInfo.Type.DICTIONARY); + ci.setFormat(getType()); + ci.setInt("elements", 
this.getNumberOfElements()); + ci.save(output); + + IntermediateListener iListener = new IntermediateListener(listener); + shared.save(output, iListener); + subjects.save(output, iListener); + predicates.save(output, iListener); + + writeLiteralsMap(output, iListener); + + } + /* + ------------------ + |len| Literal URI| + ------------------ + */ + private void writeLiteralsMap(OutputStream output,ProgressListener listener) throws IOException { + Iterator hmIterator = objects.entrySet().iterator(); + int numberOfTypes = objects.size(); + output.write(numberOfTypes); + + ArrayList types = new ArrayList<>(); + + while (hmIterator.hasNext()){ + Map.Entry entry = (Map.Entry)hmIterator.next(); + String uri = (String)entry.getKey(); + output.write(uri.length()); + IOUtil.writeBuffer(output, uri.getBytes(), 0, uri.getBytes().length, listener); + types.add(uri); + } + for(String type:types){ + this.objects.get(type).save(output,listener); + } + } + private void readLiteralsMap(InputStream input,ProgressListener listener) throws IOException { + int numberOfTypes = input.read(); + ArrayList types = new ArrayList<>(); + for (int i = 0; i < numberOfTypes; i++) { + int length = input.read(); + byte[] type = IOUtil.readBuffer(input, length, listener); + types.add(new String(type)); + } + for(String type:types){ + this.objects.put(type,DictionarySectionFactory.loadFrom(input,listener)); + } + } + private void mapLiteralsMap(CountInputStream input,File f,ProgressListener listener) throws IOException { + int numberOfTypes = input.read(); + ArrayList types = new ArrayList<>(); + for (int i = 0; i < numberOfTypes; i++) { + int length = input.read(); + byte[] type = IOUtil.readBuffer(input, length, listener); + String typeStr = new String(type); + types.add(typeStr); + } + for(String type:types){ + this.objects.put(type,DictionarySectionFactory.loadFrom(input,f,listener)); + } + + } + + + /* (non-Javadoc) + * @see hdt.dictionary.Dictionary#load(java.io.InputStream) + */ + @Override + 
public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException {
+        if (ci.getType() != ControlInfo.Type.DICTIONARY) {
+            throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary.");
+        }
+
+        // Sections are read in the order save() writes them: shared, subjects, predicates, literals map.
+        IntermediateListener iListener = new IntermediateListener(listener);
+        shared = DictionarySectionFactory.loadFrom(input, iListener);
+        subjects = DictionarySectionFactory.loadFrom(input, iListener);
+        predicates = DictionarySectionFactory.loadFrom(input, iListener);
+
+        readLiteralsMap(input, listener);
+    }
+
+    @Override
+    public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException {
+        ControlInformation ci = new ControlInformation();
+        ci.load(in);
+        if (ci.getType() != ControlInfo.Type.DICTIONARY) {
+            throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary.");
+        }
+
+        // Same order as load(InputStream, ...), using the loadFrom overload that also takes the file.
+        IntermediateListener iListener = new IntermediateListener(listener);
+        shared = DictionarySectionFactory.loadFrom(in, f, iListener);
+        subjects = DictionarySectionFactory.loadFrom(in, f, iListener);
+        predicates = DictionarySectionFactory.loadFrom(in, f, iListener);
+
+        mapLiteralsMap(in, f, listener);
+
+        // Use cache only for predicates. Preload only up to 100K predicates.
+        // FIXME: DISABLED
+//        predicates = new DictionarySectionCacheAll(predicates, predicates.getNumberOfElements()<100000);
+    }
+
+    /**
+     * Adds up the number of entries of every per-type object section.
+     */
+    @Override
+    public long getNAllObjects() {
+        long total = 0;
+        for (DictionarySectionPrivate section : objects.values()) {
+            total += section.getNumberOfElements();
+        }
+        return total;
+    }
+
+    /** Returns a new map holding the same type-key to section entries as this dictionary. */
+    @Override
+    public TreeMap getAllObjects() {
+        return new TreeMap<>(objects);
+    }
+
+    /* (non-Javadoc)
+     * @see hdt.dictionary.Dictionary#populateHeader(hdt.header.Header, java.lang.String)
+     */
+    @Override
+    public void populateHeader(Header header, String rootNode) {
+        header.insert(rootNode, HDTVocabulary.DICTIONARY_TYPE, getType());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSUBJECTS, getNsubjects());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMPREDICATES, getNpredicates());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMOBJECTS, getNobjects());
+        header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSHARED, getNshared());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_MAXSUBJECTID, getMaxSubjectID());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_MAXPREDICATEID, getMaxPredicateID());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_MAXOBJECTTID, getMaxObjectID());
+        header.insert(rootNode, HDTVocabulary.DICTIONARY_SIZE_STRINGS, size());
+    }
+
+    /* (non-Javadoc)
+     * @see hdt.dictionary.Dictionary#getType()
+     */
+    @Override
+    public String getType() {
+        return HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION;
+    }
+
+    @Override
+    public void close() throws IOException {
+        shared.close();
+        subjects.close();
+        predicates.close();
+
+        // close all subsections
+        for (DictionarySectionPrivate section : objects.values()) {
+            section.close();
+        }
+    }
+}
diff --git
a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java
new file mode 100755
index 00000000..6627c3cb
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryBig.java
@@ -0,0 +1,247 @@
+package org.rdfhdt.hdt.dictionary.impl;
+
+import org.rdfhdt.hdt.dictionary.DictionarySection;
+import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate;
+import org.rdfhdt.hdt.dictionary.TempDictionary;
+import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory;
+import org.rdfhdt.hdt.dictionary.impl.section.HashDictionarySection;
+import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySectionBig;
+import org.rdfhdt.hdt.exceptions.IllegalFormatException;
+import org.rdfhdt.hdt.hdt.HDTVocabulary;
+import org.rdfhdt.hdt.header.Header;
+import org.rdfhdt.hdt.listener.ProgressListener;
+import org.rdfhdt.hdt.options.ControlInfo;
+import org.rdfhdt.hdt.options.ControlInformation;
+import org.rdfhdt.hdt.options.HDTOptions;
+import org.rdfhdt.hdt.util.CustomIterator;
+import org.rdfhdt.hdt.util.LiteralsUtils;
+import org.rdfhdt.hdt.util.io.CountInputStream;
+import org.rdfhdt.hdt.util.io.IOUtil;
+import org.rdfhdt.hdt.util.listener.IntermediateListener;
+
+import java.io.EOFException;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+
+/**
+ * Dictionary that keeps shared, subject and predicate terms in one section each and the
+ * objects in one {@link PFCDictionarySectionBig} per type key, as computed by
+ * {@link LiteralsUtils#getType(CharSequence)}.
+ */
+public class MultipleSectionDictionaryBig extends MultipleBaseDictionary {
+
+    /** Type key that {@link LiteralsUtils#getType(CharSequence)} gives to entries that are not literals. */
+    private static final String NO_DATATYPE = "NO_DATATYPE";
+
+    /** The type count and every type-key length are each stored in a single byte. */
+    private static final int MAX_BYTE_VALUE = 0xFF;
+
+    public MultipleSectionDictionaryBig(HDTOptions spec) {
+        super(spec);
+        // FIXME: Read type from spec.
+        subjects = new PFCDictionarySectionBig(spec);
+        predicates = new PFCDictionarySectionBig(spec);
+        objects = new TreeMap<>();
+        shared = new PFCDictionarySectionBig(spec);
+    }
+
+    /* (non-Javadoc)
+     * @see hdt.dictionary.Dictionary#load(hdt.dictionary.Dictionary)
+     */
+    @Override
+    public void load(TempDictionary other, ProgressListener listener) {
+        IntermediateListener iListener = new IntermediateListener(listener);
+        subjects.load(other.getSubjects(), iListener);
+        predicates.load(other.getPredicates(), iListener);
+
+        // Work on a copy so that loading does not alter the counts kept by the temporary dictionary.
+        HashMap<String, Long> literalsCounts =
+                new HashMap<>(((HashDictionarySection) other.getObjects()).getLiteralsCounts());
+        Long nonLiterals = literalsCounts.get(NO_DATATYPE);
+        if (nonLiterals != null) {
+            literalsCounts.put(NO_DATATYPE, nonLiterals - other.getShared().getNumberOfElements());
+        }
+        CustomIterator customIterator = new CustomIterator(other.getObjects().getEntries(), literalsCounts);
+
+        // CustomIterator.hasNext() turns false at the end of each run of same-typed entries,
+        // so every pass of this loop fills exactly one section.
+        while (customIterator.hasNext()) {
+            PFCDictionarySectionBig section = new PFCDictionarySectionBig(spec);
+            String type = LiteralsUtils.getType(customIterator.prev);
+            long numEntries = literalsCounts.get(type);
+
+            section.load(customIterator, numEntries, listener);
+            objects.put(type, section);
+        }
+        shared.load(other.getShared(), iListener);
+    }
+
+    /* (non-Javadoc)
+     * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener)
+     */
+    @Override
+    public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException {
+        ci.setType(ControlInfo.Type.DICTIONARY);
+        ci.setFormat(getType());
+        ci.setInt("elements", this.getNumberOfElements());
+        ci.save(output);
+
+        IntermediateListener iListener = new IntermediateListener(listener);
+        shared.save(output, iListener);
+        subjects.save(output, iListener);
+        predicates.save(output, iListener);
+
+        writeLiteralsMap(output, iListener);
+    }
+
+    /*
+     * Literals map layout: one byte with the number of type keys N, then N records of
+     * |len (1 byte)|type key (len bytes, UTF-8)|, then the N object sections in key order.
+     * N and len are each written as a single byte, hence the 255 limits enforced below.
+     */
+    private void writeLiteralsMap(OutputStream output, ProgressListener listener) throws IOException {
+        int numberOfTypes = objects.size();
+        if (numberOfTypes > MAX_BYTE_VALUE) {
+            throw new IOException("Cannot save " + numberOfTypes + " literal types: the count is stored in one byte");
+        }
+        output.write(numberOfTypes);
+
+        for (String uri : objects.keySet()) {
+            // The length prefix is the encoded byte count, not the char count, and the charset is fixed.
+            byte[] encoded = uri.getBytes(StandardCharsets.UTF_8);
+            if (encoded.length > MAX_BYTE_VALUE) {
+                throw new IOException("Literal type key longer than 255 bytes cannot be saved: " + uri);
+            }
+            output.write(encoded.length);
+            IOUtil.writeBuffer(output, encoded, 0, encoded.length, listener);
+        }
+        for (DictionarySectionPrivate section : objects.values()) {
+            section.save(output, listener);
+        }
+    }
+
+    private void readLiteralsMap(InputStream input, ProgressListener listener) throws IOException {
+        for (String type : readLiteralTypes(input, listener)) {
+            objects.put(type, DictionarySectionFactory.loadFrom(input, listener));
+        }
+    }
+
+    private void mapLiteralsMap(CountInputStream input, File f, ProgressListener listener) throws IOException {
+        for (String type : readLiteralTypes(input, listener)) {
+            objects.put(type, DictionarySectionFactory.loadFrom(input, f, listener));
+        }
+    }
+
+    /** Reads the type keys written by writeLiteralsMap, in the order they were written. */
+    private static List<String> readLiteralTypes(InputStream input, ProgressListener listener) throws IOException {
+        List<String> types = new ArrayList<>();
+        for (int remaining = readByte(input); remaining > 0; remaining--) {
+            byte[] key = IOUtil.readBuffer(input, readByte(input), listener);
+            types.add(new String(key, StandardCharsets.UTF_8));
+        }
+        return types;
+    }
+
+    /** Reads one byte as a value in 0..255; a truncated stream is an error, not an empty map. */
+    private static int readByte(InputStream input) throws IOException {
+        int value = input.read();
+        if (value < 0) {
+            throw new EOFException("Unexpected end of stream in the literals map");
+        }
+        return value;
+    }
+
+    /* (non-Javadoc)
+     * @see hdt.dictionary.Dictionary#load(java.io.InputStream)
+     */
+    @Override
+    public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException {
+        if (ci.getType() != ControlInfo.Type.DICTIONARY) {
+            throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary.");
+        }
+
+        IntermediateListener iListener = new IntermediateListener(listener);
+
+        shared = DictionarySectionFactory.loadFrom(input, iListener);
+        subjects = DictionarySectionFactory.loadFrom(input, iListener);
+        predicates = DictionarySectionFactory.loadFrom(input, iListener);
+
+        readLiteralsMap(input, listener);
+    }
+
+    @Override
+    public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException {
+        ControlInformation ci = new ControlInformation();
+        ci.load(in);
+        if (ci.getType() != ControlInfo.Type.DICTIONARY) {
+            throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary.");
+        }
+
+        IntermediateListener iListener = new IntermediateListener(listener);
+        shared = DictionarySectionFactory.loadFrom(in, f, iListener);
+        subjects = DictionarySectionFactory.loadFrom(in, f, iListener);
+        predicates = DictionarySectionFactory.loadFrom(in, f, iListener);
+
+        mapLiteralsMap(in, f, listener);
+
+        // Use cache only for predicates. Preload only up to 100K predicates.
+        // FIXME: DISABLED
+//        predicates = new DictionarySectionCacheAll(predicates, predicates.getNumberOfElements()<100000);
+    }
+
+    @Override
+    public long getNAllObjects() {
+        long count = 0;
+        for (DictionarySectionPrivate section : objects.values()) {
+            count += section.getNumberOfElements();
+        }
+        return count;
+    }
+
+    // NOTE(review): the type arguments of the return type are missing in this copy of the patch;
+    // presumably TreeMap<String, DictionarySection> - confirm against the overridden declaration.
+    @Override
+    public TreeMap getAllObjects() {
+        return new TreeMap<>(objects);
+    }
+
+    /* (non-Javadoc)
+     * @see hdt.dictionary.Dictionary#populateHeader(hdt.header.Header, java.lang.String)
+     */
+    @Override
+    public void populateHeader(Header header, String rootNode) {
+        header.insert(rootNode, HDTVocabulary.DICTIONARY_TYPE, getType());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSUBJECTS, getNsubjects());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMPREDICATES, getNpredicates());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMOBJECTS, getNobjects());
+        header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSHARED, getNshared());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_MAXSUBJECTID, getMaxSubjectID());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_MAXPREDICATEID, getMaxPredicateID());
+//        header.insert(rootNode, HDTVocabulary.DICTIONARY_MAXOBJECTTID, getMaxObjectID());
+        header.insert(rootNode, HDTVocabulary.DICTIONARY_SIZE_STRINGS, size());
+    }
+
+    /* (non-Javadoc)
+     * @see hdt.dictionary.Dictionary#getType()
+     */
+    @Override
+    public String getType() {
+        return HDTVocabulary.DICTIONARY_TYPE_MULT_SECTION;
+    }
+
+    @Override
+    public void close() throws IOException {
+        shared.close();
+        subjects.close();
+        predicates.close();
+
+        // close all subsections
+        for (DictionarySectionPrivate section : objects.values()) {
+            section.close();
+        }
+    }
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/OptimizedExtractor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/OptimizedExtractor.java
new file mode 100755
index 00000000..c653eac1
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/OptimizedExtractor.java
@@ -0,0 +1,7 @@
+package org.rdfhdt.hdt.dictionary.impl;
+
+import org.rdfhdt.hdt.enums.TripleComponentRole;
+
+public interface OptimizedExtractor {
+    CharSequence idToString(long id, TripleComponentRole role);
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java
index 3fcf96a9..e4eb79b5 100644
--- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/HashDictionarySection.java
@@ -37,7 +37,9 @@
 import org.rdfhdt.hdt.dictionary.TempDictionarySection;
 import org.rdfhdt.hdt.options.HDTOptions;
 import
org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.util.LiteralsUtils; import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.CharSequenceCustomComparator; import org.rdfhdt.hdt.util.string.CompactString; /** @@ -50,21 +52,27 @@ public class HashDictionarySection implements TempDictionarySection { private HashMap map; private List list; private int size; - boolean sorted; - + public boolean sorted; + boolean isCustom; + private HashMap literalsCounts; /** - * + * */ + public HashDictionarySection(boolean isCustom) { + this(new HDTSpecification()); + this.isCustom = isCustom; + } public HashDictionarySection() { this(new HDTSpecification()); + this.isCustom = isCustom; } - public HashDictionarySection(HDTOptions spec) { map = new HashMap<>(); list = new ArrayList<>(); size=0; + literalsCounts = new HashMap<>(); } - + /* (non-Javadoc) * @see hdt.dictionary.DictionarySection#locate(java.lang.CharSequence) */ @@ -115,56 +123,70 @@ public Iterator getSortedEntries() { } return list.iterator(); } - + @Override public Iterator getEntries() { return list.iterator(); } @Override - public long add(CharSequence entry) { + public long add(CharSequence entry) { CharSequence compact = new CompactString(entry); Long pos = map.get(compact); if(pos!=null) { // Found return existing ID. return pos; } - + // Not found, insert new list.add(compact); map.put(compact, (long) list.size()); - + size+=compact.length(); sorted = false; - + + // custom for subsection literals .. 
+ if(isCustom){ + String type = LiteralsUtils.getType(entry); + // check if the entry doesn't already exists + if(map.get(entry) == null) { + if (literalsCounts.containsKey(type)) { + literalsCounts.put(type, literalsCounts.get(type) + 1L); + } else + literalsCounts.put(type, 1L); + } + } return list.size(); } @Override - public void remove(CharSequence seq) { + public void remove(CharSequence seq) { map.remove(seq); sorted = false; } - + @Override - public void sort() { + public void sort() { // Update list. list = new ArrayList<>(map.size()); for(CharSequence str : map.keySet()) { list.add(str); } - + // Sort list - Collections.sort(list, new CharSequenceComparator()); - + if(isCustom) + Collections.sort(list, new CharSequenceCustomComparator()); + else + Collections.sort(list, new CharSequenceComparator()); + // Update map indexes for(long i=1;i<=getNumberOfElements();i++) { map.put(extract(i), i); } - + sorted = true; } - + @Override public boolean isSorted() { return sorted; @@ -177,10 +199,14 @@ public void clear() { size=0; sorted = false; //because if sorted won't be anymore } - + @Override public void close() throws IOException { map=null; list=null; } + + public HashMap getLiteralsCounts() { + return literalsCounts; + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java index 131c8954..230205b0 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java @@ -49,6 +49,7 @@ import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary; import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionaryBig; import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionaryCat; +import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionary; import org.rdfhdt.hdt.enums.ResultEstimationType; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.exceptions.IllegalFormatException; 
@@ -165,8 +166,8 @@ public void loadFromHDT(InputStream input, ProgressListener listener) throws IOE ci.clear(); ci.load(input); String hdtFormat = ci.getFormat(); - if(!hdtFormat.equals(HDTVocabulary.HDT_CONTAINER)) { - throw new IllegalFormatException("This software (v" + HDTVersion.HDT_VERSION + ".x.x) cannot open this version of HDT File (" + hdtFormat + ")"); + if(!hdtFormat.equals(HDTVocabulary.HDT_CONTAINER) && !hdtFormat.equals(HDTVocabulary.HDT_CONTAINER_2)) { + throw new IllegalFormatException("This software (v" + HDTVersion.HDT_VERSION + ".x.x | v"+HDTVersion.HDT_VERSION_2+".x.x) cannot open this version of HDT File (" + hdtFormat + ")"); } // Load header @@ -248,8 +249,8 @@ public void mapFromHDT(File f, long offset, ProgressListener listener) throws IO ci.clear(); ci.load(input); String hdtFormat = ci.getFormat(); - if(!hdtFormat.equals(HDTVocabulary.HDT_CONTAINER)) { - throw new IllegalFormatException("This software (v" + HDTVersion.HDT_VERSION + ".x.x) cannot open this version of HDT File (" + hdtFormat + ")"); + if(!hdtFormat.equals(HDTVocabulary.HDT_CONTAINER) && !hdtFormat.equals(HDTVocabulary.HDT_CONTAINER_2)) { + throw new IllegalFormatException("This software (v" + HDTVersion.HDT_VERSION + ".x.x | v"+HDTVersion.HDT_VERSION_2+".x.x) cannot open this version of HDT File (" + hdtFormat + ")"); } // Load header @@ -374,10 +375,15 @@ public long estimatedNumResults() { } }; } - + if(isMapped) { try { - return new DictionaryTranslateIteratorBuffer(triples.search(triple), (FourSectionDictionary) dictionary, subject, predicate, object); + if(dictionary instanceof MultipleSectionDictionary){ + return new DictionaryTranslateIteratorBuffer(triples.search(triple), (MultipleSectionDictionary) dictionary, subject, predicate, object); + }else{ + return new DictionaryTranslateIteratorBuffer(triples.search(triple), (FourSectionDictionary) dictionary, subject, predicate, object); + + } }catch(NullPointerException e) { e.printStackTrace(); return new 
DictionaryTranslateIterator(triples.search(triple), dictionary, subject, predicate, object); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/DictionaryTranslateIteratorBuffer.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/DictionaryTranslateIteratorBuffer.java index 96cf91e7..56bf2f72 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/DictionaryTranslateIteratorBuffer.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/DictionaryTranslateIteratorBuffer.java @@ -34,8 +34,7 @@ import java.util.List; import java.util.Map; -import org.rdfhdt.hdt.dictionary.impl.DictionaryPFCOptimizedExtractor; -import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.*; import org.rdfhdt.hdt.enums.ResultEstimationType; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.triples.IteratorTripleID; @@ -53,7 +52,7 @@ public class DictionaryTranslateIteratorBuffer implements IteratorTripleString { final int blockSize; IteratorTripleID iterator; - DictionaryPFCOptimizedExtractor dictionary; + OptimizedExtractor dictionary; CharSequence s, p, o; List triples; @@ -67,7 +66,11 @@ public class DictionaryTranslateIteratorBuffer implements IteratorTripleString { public DictionaryTranslateIteratorBuffer(IteratorTripleID iteratorTripleID, FourSectionDictionary dictionary, CharSequence s, CharSequence p, CharSequence o) { this(iteratorTripleID,dictionary,s,p,o,DEFAULT_BLOCK_SIZE); } - + public DictionaryTranslateIteratorBuffer(IteratorTripleID iteratorTripleID, MultipleSectionDictionary dictionary, CharSequence s, CharSequence p, CharSequence o) { + this(iteratorTripleID,dictionary,s,p,o,DEFAULT_BLOCK_SIZE); + } + + public DictionaryTranslateIteratorBuffer(IteratorTripleID iteratorTripleID, FourSectionDictionary dictionary, CharSequence s, CharSequence p, CharSequence o, int blockSize) { this.blockSize = blockSize; this.iterator = iteratorTripleID; @@ -77,6 +80,16 @@ public 
DictionaryTranslateIteratorBuffer(IteratorTripleID iteratorTripleID, Four
         this.p = p==null ? "" : p;
         this.o = o==null ? "" : o;
     }
+    public DictionaryTranslateIteratorBuffer(IteratorTripleID iteratorTripleID, MultipleSectionDictionary dictionary, CharSequence s, CharSequence p, CharSequence o, int blockSize) {
+        this.blockSize = blockSize;
+        this.iterator = iteratorTripleID;
+        this.dictionary = new MultDictionaryPFCOptimizedExtractor(dictionary);
+
+        this.s = s==null ? "" : s;
+        this.p = p==null ? "" : p;
+        this.o = o==null ? "" : o;
+    }
+
 
     private void reset() {
         triples = new ArrayList(blockSize);
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java
new file mode 100755
index 00000000..bdadc045
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/CustomIterator.java
@@ -0,0 +1,77 @@
+package org.rdfhdt.hdt.util;
+
+import java.util.HashMap;
+import java.util.Iterator;
+
+/**
+ * Hands out the entries of a wrapped iterator one group at a time, with the datatype suffix
+ * removed from every entry by {@link LiteralsUtils#removeType(CharSequence)}. The size of a
+ * group is looked up in {@code literalsCounts} under the type key of the group's first entry.
+ *
+ * <p>This class deliberately deviates from the usual {@link Iterator} contract:
+ * {@link #hasNext()} returns {@code false} once at the end of every group and, if the wrapped
+ * iterator has more entries, fetches the first entry of the following group into
+ * {@link #prev} while doing so; the call after that returns {@code true} again. A caller can
+ * therefore drain one group with a plain {@code while (hasNext()) next()} loop, read the type
+ * of the upcoming group from {@link #prev}, and repeat until {@link #hasNext()} stays
+ * {@code false}.
+ */
+public class CustomIterator implements Iterator<CharSequence> {
+    /** Last entry fetched from the wrapped iterator, still carrying its type suffix. */
+    public CharSequence prev = "";
+    /** True while {@link #prev} has been fetched but not yet returned by {@link #next()}. */
+    boolean first = true;
+    Iterator<? extends CharSequence> iter;
+    HashMap<String, Long> literalsCounts;
+    /** Entries of the current group that are still to be fetched from the wrapped iterator. */
+    private long currCount;
+
+    public CustomIterator(Iterator<? extends CharSequence> iter, HashMap<String, Long> literalsCounts) {
+        this.iter = iter;
+        this.literalsCounts = literalsCounts;
+        if (iter.hasNext()) {
+            fetchFirstOfGroup();
+        } else {
+            first = false;
+        }
+    }
+
+    @Override
+    public boolean hasNext() {
+        if (currCount != 0) {
+            return true;
+        }
+        if (first) {
+            // The first entry of a group has been fetched but not handed out yet.
+            return true;
+        }
+        if (iter.hasNext()) {
+            // Group boundary: prepare the next group, but report the end of the current one.
+            fetchFirstOfGroup();
+            first = true;
+        }
+        return false;
+    }
+
+    @Override
+    public CharSequence next() {
+        if (first) {
+            first = false;
+        } else {
+            prev = iter.next();
+            currCount--;
+        }
+        return LiteralsUtils.removeType(prev);
+    }
+
+    /** Fetches the next entry into prev and sets currCount to the rest of that entry's group. */
+    private void fetchFirstOfGroup() {
+        prev = iter.next();
+        String type = LiteralsUtils.getType(prev);
+        Long groupSize = literalsCounts.get(type);
+        if (groupSize == null) {
+            throw new IllegalStateException("No entry count registered for literal type " + type);
+        }
+        currCount = groupSize - 1;
+    }
+}
diff --git
a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java
new file mode 100755
index 00000000..2b8d1a61
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/LiteralsUtils.java
@@ -0,0 +1,66 @@
+package org.rdfhdt.hdt.util;
+
+import org.apache.jena.graph.Node;
+import org.rdfhdt.hdt.rdf.parsers.JenaNodeCreator;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Helpers to read the type key of a dictionary entry and to strip the datatype suffix
+ * from a typed literal.
+ */
+public class LiteralsUtils {
+    // NOTE(review): '.' does not match line terminators, so a literal whose lexical form
+    // contains a raw line break is not matched - confirm whether such entries can occur.
+    /** Matches a complete typed literal of the form {@code "lexical form"^^<datatype>}. */
+    static Pattern pattern = Pattern.compile("\".*\"\\^\\^<.*>");
+
+    /** Characters between lexical form and datatype: closing quote, two carets, opening bracket. */
+    private static final String DATATYPE_MARKER = "\"^^<";
+
+    /**
+     * Tells whether a literal carries a language tag.
+     *
+     * @param str the literal, in the form accepted by {@code JenaNodeCreator.createLiteral}
+     * @return {@code true} if the language of the created node is not the empty string
+     */
+    public static boolean containsLanguage(String str) {
+        Node node = JenaNodeCreator.createLiteral(str);
+        String lang = node.getLiteralLanguage();
+        return !lang.equals("");
+    }
+
+    /**
+     * Returns the type key of an entry.
+     *
+     * @param str the entry
+     * @return the datatype URI of the created literal node wrapped in angle brackets if the entry
+     *         starts with a double quote, {@code "NO_DATATYPE"} otherwise (also for an empty entry)
+     */
+    public static String getType(CharSequence str) {
+        // TODO split blank nodes as well in a separate section
+        if (str.length() > 0 && str.charAt(0) == '"') {
+            Node node = JenaNodeCreator.createLiteral(str.toString());
+            return "<" + node.getLiteralDatatypeURI() + ">";
+        }
+        return "NO_DATATYPE";
+    }
+
+    /**
+     * Strips the {@code ^^<datatype>} suffix from a typed literal.
+     *
+     * @param str the entry
+     * @return the quoted lexical form if the entry matches the typed-literal pattern, otherwise
+     *         the entry unchanged, as a string
+     */
+    public static String removeType(CharSequence str) {
+        String text = str.toString();
+        Matcher matcher = pattern.matcher(text);
+        if (!matcher.matches()) {
+            return text;
+        }
+        // A match guarantees that the marker is present; cut right after its closing quote.
+        return text.substring(0, text.lastIndexOf(DATATYPE_MARKER) + 1);
+    }
+}
diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CharSequenceCustomComparator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CharSequenceCustomComparator.java
new file mode 100755
index 00000000..dd535624
--- /dev/null
+++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/CharSequenceCustomComparator.java
@@ -0,0 +1,86 @@
+/*
+ * File: $HeadURL: https://hdt-java.googlecode.com/svn/trunk/hdt-java/src/org/rdfhdt/hdt/util/string/CharSequenceComparator.java $
+ * Revision: $Rev: 200 $
+ * Last modified: $Date: 2013-04-17 23:36:44 +0100 (mi, 17 abr 2013) $
+ * Last modified by: $Author: mario.arias $
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 3.0 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Contacting the authors:
+ *   Mario Arias:               mario.arias@deri.org
+ *   Javier D. Fernandez:       jfergar@infor.uva.es
+ *   Miguel A.
Martinez-Prieto: migumar2@infor.uva.es
+ */
+
+package org.rdfhdt.hdt.util.string;
+
+import org.rdfhdt.hdt.util.LiteralsUtils;
+
+import java.util.Comparator;
+
+/**
+ * Orders entries by the type key returned by {@link LiteralsUtils#getType(CharSequence)}
+ * first and by their string order second, so that sorting with this comparator places
+ * entries with the same type key next to each other.
+ *
+ * @author mario.arias
+ */
+public final class CharSequenceCustomComparator implements Comparator<CharSequence> {
+
+    /** Shared instance; the comparator has no fields of its own. */
+    private static final Comparator<CharSequence> instance = new CharSequenceCustomComparator();
+
+    /** Returns the shared instance. */
+    public static Comparator<CharSequence> getInstance() {
+        return instance;
+    }
+
+    /* (non-Javadoc)
+     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+     */
+    @Override
+    public int compare(CharSequence s1, CharSequence s2) {
+        if (s1 == s2) {
+            return 0;
+        }
+
+        // A difference in the type key decides the order on its own.
+        int byType = LiteralsUtils.getType(s1).compareTo(LiteralsUtils.getType(s2));
+        if (byType != 0) {
+            return byType;
+        }
+
+        // Same type key: compare the text, through the class-specific compareTo when both
+        // operands are of the same string class.
+        s1 = DelayedString.unwrap(s1);
+        s2 = DelayedString.unwrap(s2);
+
+        if (s1 instanceof CompactString && s2 instanceof CompactString) {
+            return ((CompactString) s1).compareTo((CompactString) s2);
+        }
+
+        if (s1 instanceof String && s2 instanceof String) {
+            return ((String) s1).compareTo((String) s2);
+        }
+
+        if (s1 instanceof ReplazableString && s2 instanceof ReplazableString) {
+            return ((ReplazableString) s1).compareTo((ReplazableString) s2);
+        }
+
+        // Slower but safe
+        return s1.toString().compareTo(s2.toString());
+    }
+
+}
diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java
new file mode 100644
index 00000000..66875163
--- /dev/null
+++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java
@@ -0,0 +1,48 @@
+package org.rdfhdt.hdt.literalsDict;
+
+import org.junit.Test;
+import org.rdfhdt.hdt.enums.RDFNotation;
+import org.rdfhdt.hdt.enums.TripleComponentRole;
+import org.rdfhdt.hdt.hdt.HDT; +import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.triples.IteratorTripleString; +import org.rdfhdt.hdt.triples.TripleString; + +import java.io.File; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; + +public class HDTLiteralsDictTest { + + @Test + public void testIdConversion(){ + ClassLoader classLoader = getClass().getClassLoader(); + String file1 = classLoader.getResource("example4+5.nt").getFile(); + HDTSpecification spec = new HDTSpecification(); + spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;"); + try { + HDT hdt1 = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, null); + IteratorTripleString iterator = hdt1.search("","",""); + while (iterator.hasNext()){ + TripleString next = iterator.next(); + System.out.println(next); + + long subId = hdt1.getDictionary().stringToId(next.getSubject().toString(), TripleComponentRole.SUBJECT); + String subj = hdt1.getDictionary().idToString(subId,TripleComponentRole.SUBJECT).toString(); + assertEquals(next.getSubject(), subj); + + long predId = hdt1.getDictionary().stringToId(next.getPredicate().toString(), TripleComponentRole.PREDICATE); + String pred = hdt1.getDictionary().idToString(predId,TripleComponentRole.PREDICATE).toString(); + assertEquals(next.getPredicate(), pred); + + long objId = hdt1.getDictionary().stringToId(next.getObject().toString(), TripleComponentRole.OBJECT); + String obj = hdt1.getDictionary().idToString(objId,TripleComponentRole.OBJECT).toString(); + assertEquals(next.getObject(), obj); + } + } catch (Exception e) { + e.printStackTrace(); + } + } +} From fb33ab9bfe944f614db96d9926dccc991bcb592c Mon Sep 17 00:00:00 2001 From: Ali Haidar Date: Tue, 11 Jan 2022 15:17:05 +0100 Subject: [PATCH 2/2] added tests for the extra functionaltities --- 
.../hdt/literalsDict/HDTLiteralsDictTest.java | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java index 66875163..00b28993 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/literalsDict/HDTLiteralsDictTest.java @@ -1,15 +1,21 @@ package org.rdfhdt.hdt.literalsDict; import org.junit.Test; +import org.rdfhdt.hdt.dictionary.Dictionary; +import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionary; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.enums.TripleComponentRole; +import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.hdtCat.utils.Utility; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.triples.IteratorTripleString; import org.rdfhdt.hdt.triples.TripleString; import java.io.File; +import java.io.IOException; +import java.util.AbstractMap; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; @@ -45,4 +51,50 @@ public void testIdConversion(){ e.printStackTrace(); } } + @Test + public void testGetDataTypeRange(){ + ClassLoader classLoader = getClass().getClassLoader(); + String file1 = classLoader.getResource("example22.nt").getFile(); + HDTSpecification spec = new HDTSpecification(); + spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;"); + try { + HDT hdt = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, null); + Dictionary dictionary = hdt.getDictionary(); + AbstractMap.SimpleEntry dataTypeRange = ((MultipleSectionDictionary) dictionary).getDataTypeRange("http://www.w3.org/2001/XMLSchema#float"); + long lower = dataTypeRange.getKey(); + long upper = 
dataTypeRange.getValue(); + Utility.printTriples(hdt); + assertEquals(5,lower); + assertEquals(7,upper); + + } catch (IOException e) { + e.printStackTrace(); + } catch (ParserException e) { + e.printStackTrace(); + } + } + @Test + public void testGetDataTypeOfId(){ + ClassLoader classLoader = getClass().getClassLoader(); + String file1 = classLoader.getResource("example22.nt").getFile(); + HDTSpecification spec = new HDTSpecification(); + spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;"); + try { + HDT hdt = HDTManager.generateHDT(new File(file1).getAbsolutePath(), "uri", RDFNotation.NTRIPLES, spec, null); + Dictionary dictionary = hdt.getDictionary(); + + // first get the id of a given string + long id = dictionary.stringToId("\"Ali Haidar\"@en",TripleComponentRole.OBJECT); + // by default of there is no string datatype in the rdf file, the dictionary will create a section for the + // strings + assertEquals("",dictionary.dataTypeOfId(id)); + + + + } catch (IOException e) { + e.printStackTrace(); + } catch (ParserException e) { + e.printStackTrace(); + } + } }