Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDTCatTree to create an HDT with low resources using HDTCat #172

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Implement HDTCatTree to create an HDT with low resources using HDTCat
  • Loading branch information
ate47 committed Oct 7, 2022
commit 58371a05d5ccc48d214ea4bf322cd048b323c7fa
59 changes: 59 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.rdf.RDFFluxStop;
import org.rdfhdt.hdt.rdf.TripleWriter;
import org.rdfhdt.hdt.triples.TripleString;

Expand Down Expand Up @@ -339,6 +340,61 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
return HDTManager.getInstance().doHDTDiffBit(location, hdtFileName, deleteBitmap, hdtFormat, listener);
}


/**
* Create an HDT file from an RDF file in a tree, stop the chunk creation with the fluxStop
*
* @param fluxStop Flux stopper
* @param supplier HDT supplier to create initial HDT before cat
* @param rdfFileName File name.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
*
* @throws IOException when the file cannot be found
* @throws ParserException when the file cannot be parsed
* @return HDT
*/
public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, rdfFileName, baseURI, rdfNotation, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream, stop the chunk creation with the fluxStop
*
* @param fluxStop Flux stopper
* @param supplier HDT supplier to create initial HDT before cat
* @param rdfStream Stream.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
*
* @throws IOException when the file cannot be found
* @throws ParserException when the file cannot be parsed
* @return HDT
*/
public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream rdfStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, rdfStream, baseURI, rdfNotation, hdtFormat, listener);
}

/**
* Create an HDT from an RDF iterator, stop the chunk creation with the fluxStop
*
* @param fluxStop Flux stopper
* @param supplier HDT supplier to create initial HDT before cat
* @param iterator A provider of triples. Must implement hasNext(), next() and estimatedNumResults.
* @param baseURI Base URI for the dataset.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @throws IOException when the file cannot be found
* @throws ParserException when the file cannot be parsed
* @return HDT
*/
public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doHDTCatTree(fluxStop, supplier, iterator, baseURI, hdtFormat, listener);
}

// Abstract methods for the current implementation
protected abstract HDTOptions doReadOptions(String file) throws IOException;
protected abstract HDT doLoadHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException;
Expand All @@ -355,5 +411,8 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doHDTDiff(String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doHDTDiffBit(String location, String hdtFileName, Bitmap deleteBitmap, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, String filename, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, InputStream stream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doHDTCatTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;

}
42 changes: 42 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.rdfhdt.hdt.hdt;

import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.triples.TripleString;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;

/**
* Interface describing an HDT generator method
*
* @author Antoine Willerval
*/
@FunctionalInterface
public interface HDTSupplier {
/**
* @return implementation using in-memory hdt
*/
static HDTSupplier memory() {
return (iterator, baseURI, hdtFormat, listener, location) -> {
try (HDT hdt = HDTManager.generateHDT(iterator, baseURI, hdtFormat, listener)) {
hdt.saveToHDT(location.toAbsolutePath().toString(), listener);
}
};
}

/**
* Generate the HDT
*
* @param iterator the iterator to create the hdt
* @param baseURI the base URI (useless, but asked by some methods)
* @param hdtFormat the HDT options to create the HDT
* @param listener listener
* @param location where to write the HDT
* @throws IOException io exception while creating the HDT
* @throws ParserException parser exception while retrieving the triples
*/
void doGenerateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener, Path location) throws IOException, ParserException;
}
146 changes: 146 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFFluxStop.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
package org.rdfhdt.hdt.rdf;

import org.rdfhdt.hdt.triples.TripleString;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.function.BinaryOperator;

/**
* Rdf flux stopper descriptor
* @author Antoine Willerval
*/
public interface RDFFluxStop {
/**
* @return basic implementation without any limit
*/
static RDFFluxStop noLimit() {
return new RDFFluxStop() {
@Override
public boolean canHandle(TripleString ts) {
return true;
}

@Override
public void restart() {
// nothing
}
};
}

/**
* implementation of flux stop stopping after a maximum triple count
*
* @param maxTriple maximum count
* @return FluxStop
*/
static RDFFluxStop countLimit(long maxTriple) {
if (maxTriple <= 0) {
throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!");
}
return new RDFFluxStop() {
long current = 0;

@Override
public boolean canHandle(TripleString ts) {
return current++ < maxTriple;
}

@Override
public void restart() {
current = 0;
}
};
}

/**
* implementation of flux stop stopping after a maximum NTriple size
*
* @param maxSize maximum size
* @return FluxStop
*/
static RDFFluxStop sizeLimit(long maxSize) {
if (maxSize <= 0) {
throw new IllegalArgumentException("Can't have a limit of 0 or a negative value!");
}
return new RDFFluxStop() {
long size = 0;

@Override
public boolean canHandle(TripleString ts) {
long tsSize;
try {
tsSize = ts.asNtriple().toString().getBytes(StandardCharsets.UTF_8).length;
} catch (IOException e) {
throw new RuntimeException("Can't estimate the size of the triple " + ts, e);
}
try {
return size < maxSize;
} finally {
size += tsSize;
}
}

@Override
public void restart() {
size = 0;
}
};
}

/**
* should we stop the flux after this triple or not?
*
* @param ts the triple
* @return true if the flux can handle this triple, false otherwise
*/
boolean canHandle(TripleString ts);

/**
* restart the flux stop
*/
void restart();

/**
* combine 2 rdf flux stop with a boolean operation
* @param fluxStop the other flux stop
* @param operator the operator
* @return rdffluxstop
* @see #and(RDFFluxStop)
* @see #or(RDFFluxStop)
*/
default RDFFluxStop booleanOp(RDFFluxStop fluxStop, BinaryOperator<Boolean> operator) {
return new RDFFluxStop() {
@Override
public boolean canHandle(TripleString ts) {
boolean left = RDFFluxStop.this.canHandle(ts);
boolean right = fluxStop.canHandle(ts);
return operator.apply(left, right);
}

@Override
public void restart() {
RDFFluxStop.this.restart();
fluxStop.restart();
}
};
}

/**
* {@link #booleanOp(RDFFluxStop, BinaryOperator)} version for AND
* @param fluxStop other flux stop
* @return rdffluxstop
*/
default RDFFluxStop and(RDFFluxStop fluxStop) {
return booleanOp(fluxStop, (a, b) -> a && b);
}

/**
* {@link #booleanOp(RDFFluxStop, BinaryOperator)} version for OR
* @param fluxStop other flux stop
* @return rdffluxstop
*/
default RDFFluxStop or(RDFFluxStop fluxStop) {
return booleanOp(fluxStop, (a, b) -> a || b);
}
}
49 changes: 46 additions & 3 deletions hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,30 @@
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.hdt.HDTSupplier;
import org.rdfhdt.hdt.hdt.HDTVersion;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTSpecification;
import org.rdfhdt.hdt.rdf.RDFFluxStop;
import org.rdfhdt.hdt.util.StopWatch;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.internal.Lists;
import org.rdfhdt.hdt.util.StringUtil;

/**
* @author mario.arias
*
*/
public class RDF2HDT implements ProgressListener {
/**
* @return a theoretical maximum amount of memory the JVM will attempt to use
*/
private static long getMaxTreeCatChunkSize() {
Runtime runtime = Runtime.getRuntime();
return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) / (0.85 * 5));
}

public String rdfInput;
public String hdtOutput;
Expand Down Expand Up @@ -77,7 +87,13 @@ public class RDF2HDT implements ProgressListener {

@Parameter(names = "-canonicalntfile", description = "Only for NTriples input. Use a Fast NT file parser the input should be in a canonical form. See https://www.w3.org/TR/n-triples/#h2_canonical-ntriples")
public boolean ntSimpleLoading;


@Parameter(names = "-cattree", description = "Use HDTCatTree to split the HDT creation for big dataset")
public boolean catTree;

@Parameter(names = "-cattreelocation", description = "Only with -cattree, set the tree building location")
public String catTreeLocation;

public void execute() throws ParserException, IOException {
HDTSpecification spec;
if(configFile!=null) {
Expand Down Expand Up @@ -115,7 +131,30 @@ public void execute() throws ParserException, IOException {
}

StopWatch sw = new StopWatch();
HDT hdt = HDTManager.generateHDT(rdfInput, baseURI,notation , spec, this);
HDT hdt;

if (catTree) {
if (catTreeLocation != null) {
spec.set("loader.cattree.location", catTreeLocation);
}
spec.set("loader.cattree.futureHDTLocation", hdtOutput);

long maxTreeCatChunkSize = getMaxTreeCatChunkSize();

System.out.println("Compute HDT with HDTCatTree using chunk of size: " + StringUtil.humanReadableByteCount(maxTreeCatChunkSize, true));

hdt = HDTManager.catTree(
RDFFluxStop.sizeLimit(maxTreeCatChunkSize),
HDTSupplier.memory(),
rdfInput,
baseURI,
notation,
spec,
this
);
} else {
hdt = HDTManager.generateHDT(rdfInput, baseURI, notation , spec, this);
}
System.out.println("File converted in: "+sw.stopAndShow());

try {
Expand All @@ -130,7 +169,11 @@ public void execute() throws ParserException, IOException {

// Dump to HDT file
sw = new StopWatch();
hdt.saveToHDT(hdtOutput, this);

if (!catTree) {
// ignore catTree save because the file is already here
hdt.saveToHDT(hdtOutput, this);
}
System.out.println("HDT saved to file in: "+sw.stopAndShow());

// Generate index and dump it to .hdt.index file
Expand Down
Loading