Skip to content

Commit

Permalink
Implement HDTCatTree to create an HDT with low resources using HDTCat
Browse files Browse the repository at this point in the history
  • Loading branch information
ate47 committed Sep 16, 2022
1 parent 7222f8a commit 3267a24
Show file tree
Hide file tree
Showing 18 changed files with 1,328 additions and 54 deletions.
59 changes: 59 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.rdf.RDFFluxStop;
import org.rdfhdt.hdt.rdf.TripleWriter;
import org.rdfhdt.hdt.triples.TripleString;

Expand Down Expand Up @@ -339,6 +340,61 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
return HDTManager.getInstance().doHDTDiffBit(location, hdtFileName, deleteBitmap, hdtFormat, listener);
}


/**
* Create an HDT file from an RDF file in a tree, stop the chunk creation with the fluxStop
*
* @param fluxStop Flux stopper
* @param supplier HDT supplier to create initial HDT before cat
* @param rdfFileName File name.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
*
* @throws IOException when the file cannot be found
* @throws ParserException when the file cannot be parsed
* @return HDT
*/
public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, rdfFileName, baseURI, rdfNotation, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream, stop the chunk creation with the fluxStop
*
* @param fluxStop Flux stopper
* @param supplier HDT supplier to create initial HDT before cat
* @param rdfStream Stream.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
*
* @throws IOException when the file cannot be found
* @throws ParserException when the file cannot be parsed
* @return HDT
*/
public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream rdfStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, rdfStream, baseURI, rdfNotation, hdtFormat, listener);
}

/**
* Create an HDT from an RDF iterator, stop the chunk creation with the fluxStop
*
* @param fluxStop Flux stopper
* @param supplier HDT supplier to create initial HDT before cat
* @param iterator A provider of triples. Must implement hasNext(), next() and estimatedNumResults.
* @param baseURI Base URI for the dataset.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @throws IOException when the file cannot be found
* @throws ParserException when the file cannot be parsed
* @return HDT
*/
public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, iterator, baseURI, hdtFormat, listener);
}

// Abstract methods for the current implementation
protected abstract HDTOptions doReadOptions(String file) throws IOException;
protected abstract HDT doLoadHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException;
Expand All @@ -355,5 +411,8 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doHDTDiff(String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doHDTDiffBit(String location, String hdtFileName, Bitmap deleteBitmap, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, String filename, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream stream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;

}
42 changes: 42 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.rdfhdt.hdt.hdt;

import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.triples.TripleString;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;

/**
* Interface describing an HDT generator method
*
* @author Antoine Willerval
*/
@FunctionalInterface
public interface HDTSupplier {
/**
* @return implementation using in-memory hdt
*/
static HDTSupplier memory() {
return (iterator, baseURI, hdtFormat, listener, location) -> {
try (HDT hdt = HDTManager.generateHDT(iterator, baseURI, hdtFormat, listener)) {
hdt.saveToHDT(location.toAbsolutePath().toString(), listener);
}
};
}

/**
* Generate the HDT
*
* @param iterator the iterator to create the hdt
* @param baseURI the base URI (useless, but asked by some methods)
* @param hdtFormat the HDT options to create the HDT
* @param listener listener
* @param location where to write the HDT
* @throws IOException io exception while creating the HDT
* @throws ParserException parser exception while retrieving the triples
*/
void doGenerateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener, Path location) throws IOException, ParserException;
}
102 changes: 102 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package org.rdfhdt.hdt.rdf;

import org.rdfhdt.hdt.triples.TripleString;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
* Rdf flux stopper descriptor
* @author Antoine Willerval
*/
public interface RDFFluxStop {
/**
* @return basic implementation without any limit
*/
static RDFFluxStop noLimit() {
return new RDFFluxStop() {
@Override
public boolean canHandle(TripleString ts) {
return true;
}

@Override
public void restart() {
// nothing
}
};
}

/**
* implementation of flux stop stopping after a maximum triple count
*
* @param maxTriple maximum count
* @return FluxStop
*/
static RDFFluxStop countLimit(long maxTriple) {
if (maxTriple <= 0) {
throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!");
}
return new RDFFluxStop() {
long current = 0;

@Override
public boolean canHandle(TripleString ts) {
return current++ < maxTriple;
}

@Override
public void restart() {
current = 0;
}
};
}

/**
* implementation of flux stop stopping after a maximum NTriple size
*
* @param maxSize maximum size
* @return FluxStop
*/
static RDFFluxStop sizeLimit(long maxSize) {
if (maxSize <= 0) {
throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!");
}
return new RDFFluxStop() {
long size = 0;

@Override
public boolean canHandle(TripleString ts) {
long tsSize;
try {
tsSize = ts.asNtriple().toString().getBytes(StandardCharsets.UTF_8).length;
} catch (IOException e) {
throw new RuntimeException("Can't estimate the size of the triple " + ts, e);
}
try {
return size < maxSize;
} finally {
size += tsSize;
}
}

@Override
public void restart() {
size = 0;
}
};
}

/**
* should we stop the flux after this triple or not?
*
* @param ts the triple
* @return true if the flux can handle this triple, false otherwise
*/
boolean canHandle(TripleString ts);

/**
* restart the flux stop
*/
void restart();
}
49 changes: 46 additions & 3 deletions hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,30 @@
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.hdt.HDTSupplier;
import org.rdfhdt.hdt.hdt.HDTVersion;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTSpecification;
import org.rdfhdt.hdt.rdf.RDFFluxStop;
import org.rdfhdt.hdt.util.StopWatch;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.internal.Lists;
import org.rdfhdt.hdt.util.StringUtil;

/**
* @author mario.arias
*
*/
public class RDF2HDT implements ProgressListener {
/**
* @return a theoretical maximum amount of memory the JVM will attempt to use
*/
private static long getMaxTreeCatChunkSize() {
Runtime runtime = Runtime.getRuntime();
return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) / (0.85 * 5));
}

public String rdfInput;
public String hdtOutput;
Expand Down Expand Up @@ -77,7 +87,13 @@ public class RDF2HDT implements ProgressListener {

@Parameter(names = "-canonicalntfile", description = "Only for NTriples input. Use a Fast NT file parser the input should be in a canonical form. See https://www.w3.org/TR/n-triples/#h2_canonical-ntriples")
public boolean ntSimpleLoading;


@Parameter(names = "-cattree", description = "Use HDTCatTree to split the HDT creation for big dataset")
public boolean catTree;

@Parameter(names = "-cattreelocation", description = "Only with -cattree, set the tree building location")
public String catTreeLocation;

public void execute() throws ParserException, IOException {
HDTSpecification spec;
if(configFile!=null) {
Expand Down Expand Up @@ -115,7 +131,30 @@ public void execute() throws ParserException, IOException {
}

StopWatch sw = new StopWatch();
HDT hdt = HDTManager.generateHDT(rdfInput, baseURI,notation , spec, this);
HDT hdt;

if (catTree) {
if (catTreeLocation != null) {
spec.set("loader.cattree.location", catTreeLocation);
}
spec.set("loader.cattree.futureHDTLocation", hdtOutput);

long maxTreeCatChunkSize = getMaxTreeCatChunkSize();

System.out.println("Compute HDT with HDTCatTree using chunk of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true));

hdt = HDTManager.catTree(
RDFFluxStop.sizeLimit(maxTreeCatChunkSize),
HDTSupplier.memory(),
rdfInput,
baseURI,
notation,
spec,
this
);
} else {
hdt = HDTManager.generateHDT(rdfInput, baseURI, notation , spec, this);
}
System.out.println("File converted in: "+sw.stopAndShow());

try {
Expand All @@ -130,7 +169,11 @@ public void execute() throws ParserException, IOException {

// Dump to HDT file
sw = new StopWatch();
hdt.saveToHDT(hdtOutput, this);

if (!catTree) {
// ignore catTree save because the file is already here
hdt.saveToHDT(hdtOutput, this);
}
System.out.println("HDT saved to file in: "+sw.stopAndShow());

// Generate index and dump it to .hdt.index file
Expand Down
Loading

0 comments on commit 3267a24

Please sign in to comment.