Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDTCatTree + HDTGenDisk #179

Merged
merged 9 commits into from
Nov 21, 2022
Next Next commit
Implement HDTCatTree to create an HDT with low resources using HDTCat
  • Loading branch information
ate47 committed Oct 21, 2022
commit 3cb975806ad4a14243071d6cc9379f3c3ee3d211
59 changes: 59 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.rdf.RDFFluxStop;
import org.rdfhdt.hdt.rdf.TripleWriter;
import org.rdfhdt.hdt.triples.TripleString;

Expand Down Expand Up @@ -339,6 +340,61 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
return HDTManager.getInstance().doHDTDiffBit(location, hdtFileName, deleteBitmap, hdtFormat, listener);
}


/**
* Create an HDT file from an RDF file in a tree, stop the chunk creation with the fluxStop
*
* @param fluxStop Flux stopper
* @param supplier HDT supplier to create initial HDT before cat
* @param rdfFileName File name.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
*
* @throws IOException when the file cannot be found
* @throws ParserException when the file cannot be parsed
* @return HDT
*/
public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, rdfFileName, baseURI, rdfNotation, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream, stop the chunk creation with the fluxStop
*
* @param fluxStop Flux stopper
* @param supplier HDT supplier to create initial HDT before cat
* @param rdfStream Stream.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
*
* @throws IOException when the file cannot be found
* @throws ParserException when the file cannot be parsed
* @return HDT
*/
public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream rdfStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, rdfStream, baseURI, rdfNotation, hdtFormat, listener);
}

/**
* Create an HDT from an RDF iterator, stop the chunk creation with the fluxStop
*
* @param fluxStop Flux stopper
* @param supplier HDT supplier to create initial HDT before cat
* @param iterator A provider of triples. Must implement hasNext(), next() and estimatedNumResults.
* @param baseURI Base URI for the dataset.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @throws IOException when the file cannot be found
* @throws ParserException when the file cannot be parsed
* @return HDT
*/
public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, iterator, baseURI, hdtFormat, listener);
}

// Abstract methods for the current implementation
protected abstract HDTOptions doReadOptions(String file) throws IOException;
protected abstract HDT doLoadHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException;
Expand All @@ -355,5 +411,8 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doHDTDiff(String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doHDTDiffBit(String location, String hdtFileName, Bitmap deleteBitmap, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, String filename, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream stream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;

}
42 changes: 42 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.rdfhdt.hdt.hdt;

import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.triples.TripleString;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;

/**
* Interface describing an HDT generator method
*
* @author Antoine Willerval
*/
@FunctionalInterface
public interface HDTSupplier {
/**
* @return implementation using in-memory hdt
*/
static HDTSupplier memory() {
return (iterator, baseURI, hdtFormat, listener, location) -> {
try (HDT hdt = HDTManager.generateHDT(iterator, baseURI, hdtFormat, listener)) {
hdt.saveToHDT(location.toAbsolutePath().toString(), listener);
}
};
}

/**
* Generate the HDT
*
* @param iterator the iterator to create the hdt
* @param baseURI the base URI (useless, but asked by some methods)
* @param hdtFormat the HDT options to create the HDT
* @param listener listener
* @param location where to write the HDT
* @throws IOException io exception while creating the HDT
* @throws ParserException parser exception while retrieving the triples
*/
void doGenerateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener, Path location) throws IOException, ParserException;
}
102 changes: 102 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package org.rdfhdt.hdt.rdf;

import org.rdfhdt.hdt.triples.TripleString;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
* Rdf flux stopper descriptor
* @author Antoine Willerval
*/
public interface RDFFluxStop {
/**
* @return basic implementation without any limit
*/
static RDFFluxStop noLimit() {
return new RDFFluxStop() {
@Override
public boolean canHandle(TripleString ts) {
return true;
}

@Override
public void restart() {
// nothing
}
};
}

/**
* implementation of flux stop stopping after a maximum triple count
*
* @param maxTriple maximum count
* @return FluxStop
*/
static RDFFluxStop countLimit(long maxTriple) {
if (maxTriple <= 0) {
throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!");
}
return new RDFFluxStop() {
long current = 0;

@Override
public boolean canHandle(TripleString ts) {
return current++ < maxTriple;
}

@Override
public void restart() {
current = 0;
}
};
}

/**
* implementation of flux stop stopping after a maximum NTriple size
*
* @param maxSize maximum size
* @return FluxStop
*/
static RDFFluxStop sizeLimit(long maxSize) {
if (maxSize <= 0) {
throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!");
}
return new RDFFluxStop() {
long size = 0;

@Override
public boolean canHandle(TripleString ts) {
long tsSize;
try {
tsSize = ts.asNtriple().toString().getBytes(StandardCharsets.UTF_8).length;
} catch (IOException e) {
throw new RuntimeException("Can't estimate the size of the triple " + ts, e);
}
try {
return size < maxSize;
} finally {
size += tsSize;
}
}

@Override
public void restart() {
size = 0;
}
};
}

/**
* should we stop the flux after this triple or not?
*
* @param ts the triple
* @return true if the flux can handle this triple, false otherwise
*/
boolean canHandle(TripleString ts);

/**
* restart the flux stop
*/
void restart();
}
49 changes: 46 additions & 3 deletions hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,30 @@
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.hdt.HDTSupplier;
import org.rdfhdt.hdt.hdt.HDTVersion;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTSpecification;
import org.rdfhdt.hdt.rdf.RDFFluxStop;
import org.rdfhdt.hdt.util.StopWatch;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.internal.Lists;
import org.rdfhdt.hdt.util.StringUtil;

/**
* @author mario.arias
*
*/
public class RDF2HDT implements ProgressListener {
/**
* @return a theoretical maximum amount of memory the JVM will attempt to use
*/
private static long getMaxTreeCatChunkSize() {
Runtime runtime = Runtime.getRuntime();
return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) / (0.85 * 5));
}

public String rdfInput;
public String hdtOutput;
Expand Down Expand Up @@ -77,7 +87,13 @@ public class RDF2HDT implements ProgressListener {

@Parameter(names = "-canonicalntfile", description = "Only for NTriples input. Use a Fast NT file parser the input should be in a canonical form. See https://www.w3.org/TR/n-triples/#h2_canonical-ntriples")
public boolean ntSimpleLoading;


@Parameter(names = "-cattree", description = "Use HDTCatTree to split the HDT creation for big dataset")
public boolean catTree;

@Parameter(names = "-cattreelocation", description = "Only with -cattree, set the tree building location")
public String catTreeLocation;

public void execute() throws ParserException, IOException {
HDTSpecification spec;
if(configFile!=null) {
Expand Down Expand Up @@ -115,7 +131,30 @@ public void execute() throws ParserException, IOException {
}

StopWatch sw = new StopWatch();
HDT hdt = HDTManager.generateHDT(rdfInput, baseURI,notation , spec, this);
HDT hdt;

if (catTree) {
if (catTreeLocation != null) {
spec.set("loader.cattree.location", catTreeLocation);
}
spec.set("loader.cattree.futureHDTLocation", hdtOutput);

long maxTreeCatChunkSize = getMaxTreeCatChunkSize();

System.out.println("Compute HDT with HDTCatTree using chunk of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true));

hdt = HDTManager.catTree(
RDFFluxStop.sizeLimit(maxTreeCatChunkSize),
HDTSupplier.memory(),
rdfInput,
baseURI,
notation,
spec,
this
);
} else {
hdt = HDTManager.generateHDT(rdfInput, baseURI, notation , spec, this);
}
System.out.println("File converted in: "+sw.stopAndShow());

try {
Expand All @@ -130,7 +169,11 @@ public void execute() throws ParserException, IOException {

// Dump to HDT file
sw = new StopWatch();
hdt.saveToHDT(hdtOutput, this);

if (!catTree) {
// ignore catTree save because the file is already here
hdt.saveToHDT(hdtOutput, this);
}
System.out.println("HDT saved to file in: "+sw.stopAndShow());

// Generate index and dump it to .hdt.index file
Expand Down
Loading