Commit 929f38d
using Sequence instead of LongArrayDisk to reduce disk usage
ate47 committed May 16, 2022
1 parent 40bdf0b commit 929f38d
Showing 4 changed files with 49 additions and 29 deletions.
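For scale: LongArrayDisk stores every entry as a full 64-bit long, while SequenceLog64BigDisk packs each entry into numbits = BitUtil.log2(tripleCount + 2) + CompressUtil.INDEX_SHIFT bits. A minimal sketch of the resulting saving, assuming a hypothetical dataset of one billion triples (the count and the standalone bit arithmetic below are illustrative, not code from this commit):

public class MappingSizeSketch {
    public static void main(String[] args) {
        long entries = 1_000_000_000L + 2;           // tripleCount + 2, as in the constructor below

        long longArrayBytes = entries * Long.BYTES;  // LongArrayDisk: fixed 64 bits per entry

        // intended to match BitUtil.log2(tripleCount + 2) + CompressUtil.INDEX_SHIFT
        // (assumed semantics: bits needed to represent the value, plus the 2 flag bits)
        int numbits = 64 - Long.numberOfLeadingZeros(entries) + 2;  // 30 + 2 = 32
        long sequenceBytes = (entries * numbits + 7) / 8;           // packed entries

        System.out.printf("LongArrayDisk: %d MiB, SequenceLog64BigDisk: %d MiB%n",
                longArrayBytes >> 20, sequenceBytes >> 20);
    }
}

At that scale each of the three map files shrinks from roughly 8 GB to roughly 4 GB, since 32 bits now suffice for an id plus the two flag bits.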
CompressFourSectionDictionary.java
@@ -111,16 +111,16 @@ public CompressFourSectionDictionary(CompressionResult compressionResult, NodeCo

     // send to the consumer the element while parsing them
     this.subject = new OneReadDictionarySection(subject.mapWithId((node, index) -> {
-        nodeConsumer.onSubject(node.getIndex(), index + 1);
+        nodeConsumer.onSubject(node.getIndex(), CompressUtil.getHeaderId(index + 1));
         return node.getNode();
     }), subjects);
     this.predicate = new OneReadDictionarySection(new MapIterator<>(sortedPredicate, (node, index) -> {
-        nodeConsumer.onPredicate(node.getIndex(), index + 1);
+        nodeConsumer.onPredicate(node.getIndex(), CompressUtil.getHeaderId(index + 1));
         // force duplication because it's not made in a pipe like with the others
         return node.getNode().toString();
     }), predicates);
     this.object = new OneReadDictionarySection(object.mapWithId((node, index) -> {
-        nodeConsumer.onObject(node.getIndex(), index + 1);
+        nodeConsumer.onObject(node.getIndex(), CompressUtil.getHeaderId(index + 1));
         return node.getNode();
     }), objects);
     this.shared = new OneReadDictionarySection(shared.mapWithId((node, index) -> {
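Aside: after this change the ids handed to the NodeConsumer are header ids, i.e. the 1-based section index shifted left by CompressUtil.INDEX_SHIFT. A tiny sketch of the round trip (the index value is invented for illustration):

// mirrors the onSubject/onPredicate/onObject calls above
long index = 7;                                      // 0-based position while parsing
long headerId = CompressUtil.getHeaderId(index + 1); // (7 + 1) << 2 == 32, sent to the consumer
assert CompressUtil.getId(headerId) == index + 1;    // the receiver can recover the 1-based id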
CompressTripleMapper.java
@@ -1,6 +1,9 @@
 package org.rdfhdt.hdt.hdt.impl.diskimport;
 
+import org.rdfhdt.hdt.compact.sequence.Sequence;
+import org.rdfhdt.hdt.compact.sequence.SequenceLog64BigDisk;
 import org.rdfhdt.hdt.dictionary.impl.CompressFourSectionDictionary;
+import org.rdfhdt.hdt.util.BitUtil;
 import org.rdfhdt.hdt.util.disk.LongArrayDisk;
 import org.rdfhdt.hdt.util.io.CloseSuppressPath;
 import org.rdfhdt.hdt.util.io.IOUtil;
@@ -12,13 +15,14 @@

 /**
  * Map a compress triple file to long array map files
+ *
  * @author Antoine Willerval
  */
 public class CompressTripleMapper implements CompressFourSectionDictionary.NodeConsumer {
     private static final Logger log = LoggerFactory.getLogger(CompressTripleMapper.class);
-    private final LongArrayDisk subjects;
-    private final LongArrayDisk predicates;
-    private final LongArrayDisk objects;
+    private final SequenceLog64BigDisk subjects;
+    private final SequenceLog64BigDisk predicates;
+    private final SequenceLog64BigDisk objects;
     private final CloseSuppressPath locationSubjects;
     private final CloseSuppressPath locationPredicates;
     private final CloseSuppressPath locationObjects;
@@ -28,9 +32,10 @@ public CompressTripleMapper(CloseSuppressPath location, long tripleCount) {
     locationSubjects = location.resolve("map_subjects");
     locationPredicates = location.resolve("map_predicates");
     locationObjects = location.resolve("map_objects");
-    subjects = new LongArrayDisk(locationSubjects.toAbsolutePath().toString(), tripleCount + 2);
-    predicates = new LongArrayDisk(locationPredicates.toAbsolutePath().toString(), tripleCount + 2);
-    objects = new LongArrayDisk(locationObjects.toAbsolutePath().toString(), tripleCount + 2);
+    int numbits = BitUtil.log2(tripleCount + 2) + CompressUtil.INDEX_SHIFT;
+    subjects = new SequenceLog64BigDisk(locationSubjects.toAbsolutePath().toString(), numbits, tripleCount + 2);
+    predicates = new SequenceLog64BigDisk(locationPredicates.toAbsolutePath().toString(), numbits, tripleCount + 2);
+    objects = new SequenceLog64BigDisk(locationObjects.toAbsolutePath().toString(), numbits, tripleCount + 2);
 }
 
 /**
@@ -64,15 +69,15 @@ public void onObject(long preMapId, long newMapId) {
     objects.set(preMapId, newMapId);
 }
 
-public LongArrayDisk getSubjects() {
+public Sequence getSubjects() {
     return subjects;
 }
 
-public LongArrayDisk getPredicates() {
+public Sequence getPredicates() {
    return predicates;
 }
 
-public LongArrayDisk getObjects() {
+public Sequence getObjects() {
     return objects;
 }
 
@@ -88,6 +93,7 @@ private void checkShared() {

 /**
  * extract the map id of a subject
+ *
  * @param id id
  * @return new id
  */
@@ -97,6 +103,7 @@ public long extractSubject(long id) {

 /**
  * extract the map id of a predicate
+ *
  * @param id id
  * @return new id
  */
@@ -106,19 +113,20 @@ public long extractPredicate(long id) {

 /**
  * extract the map id of a object
+ *
  * @param id id
  * @return new id
  */
 public long extractObjects(long id) {
     return extract(objects, id);
 }
 
-private long extract(LongArrayDisk array, long id) {
+private long extract(Sequence array, long id) {
     checkShared();
     long data = array.get(id);
     // loop over the duplicates
     while (CompressUtil.isDuplicated(data)) {
-        long remap = array.get(CompressUtil.getDuplicatedIndex(data));
+        long remap = array.get(CompressUtil.getId(data));
         assert remap != data : "remap and data are the same!";
         data = remap;
     }
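To make the remapping concrete, here is a self-contained toy version of the extract() loop above, using the same constants; the plain long[] and its contents stand in for the on-disk SequenceLog64BigDisk and are invented for illustration:

public class ExtractSketch {
    static final int INDEX_SHIFT = 2;           // same values as CompressUtil
    static final long DUPLICATE_MASK = 1L << 1;

    static long getHeaderId(long id)       { return id << INDEX_SHIFT; }
    static long getId(long headerId)       { return headerId >>> INDEX_SHIFT; }
    static long asDuplicated(long id)      { return getHeaderId(id) | DUPLICATE_MASK; }
    static boolean isDuplicated(long data) { return (data & DUPLICATE_MASK) != 0; }

    public static void main(String[] args) {
        long[] map = new long[4];
        map[1] = getHeaderId(42);  // pre-map id 1 got the new id 42
        map[2] = asDuplicated(1);  // pre-map id 2 is a duplicate of pre-map id 1
        map[3] = asDuplicated(2);  // pre-map id 3 is a duplicate of pre-map id 2

        long data = map[3];
        while (isDuplicated(data)) {       // same loop shape as extract()
            data = map[(int) getId(data)]; // follow the duplicate pointer
        }
        System.out.println(getId(data));   // prints 42
    }
}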
CompressUtil.java
@@ -21,11 +21,15 @@ public class CompressUtil {
 /**
  * the mask for shared computed compressed node
  */
-public static final long SHARED_MASK = 1L << 62;
+public static final long SHARED_MASK = 1L;
 /**
  * mask for duplicated node
  */
-public static final long DUPLICATE_MASK = 1L << 61;
+public static final long DUPLICATE_MASK = 1L << 1;
+/**
+ * shift after the SHARED/DUPLICATES
+ */
+public static final int INDEX_SHIFT = 2;
 
 /**
  * write a sorted list of indexed node
@@ -84,10 +88,10 @@ public static void mergeCompressedSection(InputStream stream1, InputStream strea
 public static long computeSharedNode(long id, long sharedCount) {
     if ((id & SHARED_MASK) != 0) {
         // shared element
-        return id & ~SHARED_MASK;
+        return CompressUtil.getId(id);
     }
     // not shared
-    return id + sharedCount;
+    return CompressUtil.getId(id) + sharedCount;
 }
 
 /**
@@ -97,7 +101,7 @@ public static long computeSharedNode(long id, long sharedCount) {
  * @return shared-computable element
  */
 public static long asShared(long id) {
-    return id | SHARED_MASK;
+    return getHeaderId(id) | SHARED_MASK;
 }
 
 /**
@@ -107,7 +111,7 @@ public static long asShared(long id) {
  * @return duplicated computable element
  */
 public static long asDuplicated(long id) {
-    return id | DUPLICATE_MASK;
+    return getHeaderId(id) | DUPLICATE_MASK;
 }
 
 /**
@@ -121,13 +125,21 @@ public static boolean isDuplicated(long id) {
 }
 
 /**
- * get the value of this duplicated id
- *
- * @param id duplicated id
- * @return next mapping
+ * get the id from a header id
+ * @param headerId the header id
+ * @return the id
  */
+public static long getId(long headerId) {
+    return headerId >>> INDEX_SHIFT;
+}
+
+/**
+ * get a header id from an id
+ * @param id the id
+ * @return the header id
+ */
-public static long getDuplicatedIndex(long id) {
-    return id & ~DUPLICATE_MASK;
+public static long getHeaderId(long id) {
+    return id << INDEX_SHIFT;
 }
 
 /**
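The layout change is what ties the two files together: the old masks sat at bits 61 and 62, so even small ids produced values needing ~63 bits and forced LongArrayDisk's fixed 64-bit entries; the new header id is just the id shifted left by INDEX_SHIFT with the shared/duplicate flags in the low two bits, so values stay within the log2(tripleCount + 2) + 2 bits that CompressTripleMapper allocates. A short illustration (the id value 5 is invented):

long id = 5;
long shared = CompressUtil.asShared(id);    // (5 << 2) | 1 == 21, shared flag in bit 0
long dupe = CompressUtil.asDuplicated(id);  // (5 << 2) | 2 == 22, duplicate flag in bit 1
assert CompressUtil.getId(shared) == 5;     // one unsigned shift strips either flag
assert CompressUtil.getId(dupe) == 5;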
CompressUtilTest.java
@@ -70,15 +70,15 @@ public void bitMappingTest() {
     long sharedIndex1 = CompressUtil.asShared(index1);
 
     Assert.assertEquals(index1, CompressUtil.computeSharedNode(sharedIndex1, sharedCount));
-    Assert.assertEquals(sharedCount + index1, CompressUtil.computeSharedNode(index1, sharedCount));
+    Assert.assertEquals(sharedCount + index1, CompressUtil.computeSharedNode(CompressUtil.getHeaderId(index1), sharedCount));
 
     long dupeIndex1 = CompressUtil.asDuplicated(index1);
     long dupeSharedIndex1 = CompressUtil.asDuplicated(sharedIndex1);
 
     Assert.assertTrue(CompressUtil.isDuplicated(dupeIndex1));
     Assert.assertTrue(CompressUtil.isDuplicated(dupeSharedIndex1));
 
-    Assert.assertEquals(index1, CompressUtil.getDuplicatedIndex(dupeIndex1));
-    Assert.assertEquals(sharedIndex1, CompressUtil.getDuplicatedIndex(dupeSharedIndex1));
+    Assert.assertEquals(index1, CompressUtil.getId(dupeIndex1));
+    Assert.assertEquals(sharedIndex1, CompressUtil.getId(dupeSharedIndex1));
 }
 }
