Skip to content

Commit

Permalink
Merge options updated and some more error catching
Browse files Browse the repository at this point in the history
Some filter links were updated.
Subjects with retrieval problems are now reported in the logs, and the process
continues without them. Some counts are also reported in the logs.
  • Loading branch information
ljgarcia committed Nov 14, 2014
1 parent 1914964 commit 089f1ec
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 40 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
.settings/
deploy/
logs/
output/
target/
.project
.classpath
.classpath
build.xml
86 changes: 51 additions & 35 deletions src/main/java/uk/ac/ebi/rdf/linksets/LinksetsExtraction.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
*/
package uk.ac.ebi.rdf.linksets;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStream;

Expand All @@ -26,6 +27,7 @@
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.query.Syntax;
import com.hp.hpl.jena.rdf.model.AnonId;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
Expand Down Expand Up @@ -56,6 +58,7 @@ public class LinksetsExtraction {
private String endpoint;
private String output;
private int offset;
private int limit;
private ResultSet allTypesResultset;
private Model model;
private Resource dataset;
Expand All @@ -67,10 +70,11 @@ public class LinksetsExtraction {
private Property objectsTargetProp;
private Property subsetProp;

public LinksetsExtraction(String datasetURI, String endpoint, String output, int offset) {
public LinksetsExtraction(String datasetURI, String endpoint, String output, int offset, int limit) {
this.endpoint = endpoint;
this.output = output;
this.offset = offset;
this.limit = limit;
this.model = ModelFactory.createDefaultModel();
this.model.setNsPrefixes(Prefix.getPrefixes());
this.type = model.getProperty(Configuration.RDF_TYPE_PROP);
Expand Down Expand Up @@ -107,7 +111,11 @@ private String getQueryAllTypes() {
" ?entity rdf:type ?" + LinksetsExtraction.ENTITY_TYPE + " . " +
" ?" + LinksetsExtraction.ENTITY_TYPE + " rdf:type owl:Class . " +
"} ORDER BY ?" + LinksetsExtraction.ENTITY_TYPE + " OFFSET " + this.offset;
return query;
if (this.offset != 0) {
return query + " LIMIT " + this.limit;
} else {
return query;
}
}

/**
Expand Down Expand Up @@ -147,10 +155,10 @@ private int getTypesNumber() {
httpQuery.close();
return count;
} catch (Exception nfe) {
logger.warn("It was not possible to parse the total number of types in the dataset");
logger.warn("It was not possible to parse the total number of types in the dataset from OFFSET " + this.offset + " to LIMIT " + this.limit);
}
} else {
logger.warn("It was not possible to retrieve the total number of types in the dataset");
logger.warn("It was not possible to retrieve the total number of types in the from OFFSET " + this.offset + " to LIMIT " + this.limit);
}
httpQuery.close();
return 0;
Expand All @@ -174,20 +182,20 @@ private QueryEngineHTTP retrieveAllTypes() {
/**
* Retrieves (s, p, o) triples for all types in the dataset.
* @return Number of types effectively processed.
* @throws FileNotFoundException
* @throws Exception
*/
private int retrieveOneByOne() throws Exception {
int processed = 0;
int toLinkset = 0;
private int retrieveOneByOne() throws FileNotFoundException {
int processed = 0, processedSubjects = 0, processedTriples = 0;
String type = null;
try {
//Query one by one except those to be filtered out. If the one to be filtered out is an object, filter out that triple from results.
while (this.allTypesResultset.hasNext()) {
type = this.allTypesResultset.next().get(LinksetsExtraction.ENTITY_TYPE).toString();
if (!Configuration.shouldBeExcluded(type)) {
//Query one by one except those to be filtered out. If the one to be filtered out is an object, filter out that triple from results.
while (this.allTypesResultset.hasNext()) {
type = this.allTypesResultset.next().get(LinksetsExtraction.ENTITY_TYPE).toString();
if (!Configuration.shouldBeExcluded(type)) {
try {
Query query = QueryFactory.create(this.getQueryAType(type), Syntax.syntaxARQ);
QueryEngineHTTP httpQuery = new QueryEngineHTTP(this.endpoint, query);
ResultSet results = httpQuery.execSelect();
ResultSet results = httpQuery.execSelect();
while (results.hasNext()) {
//System.out.println("has results");
QuerySolution solution = results.next();
Expand All @@ -204,29 +212,33 @@ private int retrieveOneByOne() throws Exception {
String mergedObj = Configuration.merge(objectNode);
mergedObj = mergedObj == null ? objectNode : mergedObj;
this.addLinksetToModel(mergedSub, predicateNode, mergedObj);
toLinkset++;
processedTriples++;
} catch (MatcherExcepction e) {
logger.warn("WARN - Ommitted triple (" + subjectNode + ", " + predicateNode + ", " + objectNode + "): " + e.getMessage());
}
}
httpQuery.close();
}
processed++;
if (processed % LinksetsExtraction.LIMIT == 0) {
OutputStream output = new FileOutputStream(this.output + ".temp");
this.model.write(output, "RDF/XML-ABBREV");
logger.info("Processed so far " + processed + " types, from which only " + toLinkset + " are suitable for linksets.");
}
processedSubjects++;
httpQuery.close();
} catch (Exception e) {
e.printStackTrace();
logger.error("FATAL unexpected error, absolute OFFSET " + (processed + this.offset) + ", relative OFFSET " + processed + ".\n"
+ "Type being processed was " + type + ", will not be included in the results: " + e.getMessage() + ".\n"
+ "Last tried query was \n" + this.getQueryAType(type));
}
}
} catch(Exception e) {
logger.error("FATAL unexpected error, only " + processed + " types where actually processed. Type being processed was " + type + ".\n"
+ "Last tried query was " + this.getQueryAType(type));
OutputStream output = new FileOutputStream(this.output + ".temp");
this.model.write(output, "RDF/XML-ABBREV");
logger.info("Processed so far " + processed + " types, from which only " + toLinkset + " are suitable for linksets.");
throw e;
}
logger.info("All types processed, a total of " + processed + ", from which only " + toLinkset + " are suitable for linksets.");
processed++;
System.out.println("Processed: " + processed);
if (processed % LinksetsExtraction.LIMIT == 0) {
OutputStream output = new FileOutputStream(this.output + ".temp");
this.model.write(output, "RDF/XML-ABBREV");
logger.info("Processed so far " + processed + " types, "
+ "from which only " + processedSubjects + " are suitable for linksets. "
+ "Triples to be converted in linksets so far " + processedTriples);
}
}
logger.info("All types processed, a total of " + processed + ", "
+ "from which only " + processedSubjects + " are suitable for linksets. "
+ "Triples converted in linksets " + processedTriples);
return processed;
}

Expand All @@ -238,15 +250,18 @@ private int retrieveOneByOne() throws Exception {
*/
private void addLinksetToModel(String sub, String pred, String obj) {
//subject class partition
String subId = sub.replaceAll("[^a-zA-Z0-9]", "_");
Resource subRes = model.createResource(sub);
Resource subResNode = model.createResource().addProperty(this.classProp, subRes);
Resource subResNode = model.createResource(new AnonId(subId)).addProperty(this.classProp, subRes);
dataset.addProperty(this.classPartitionProp, subResNode);
//object class partition
String objId = obj.replaceAll("[^a-zA-Z0-9]", "_");
Resource objRes = model.createResource(obj);
Resource objResNode = model.createResource().addProperty(this.classProp, objRes);
Resource objResNode = model.createResource(new AnonId(objId)).addProperty(this.classProp, objRes);
dataset.addProperty(this.classPartitionProp, objResNode);
//link predicate
Resource linkset = model.createResource().addProperty(this.type, model.createResource(VoidVocabulary.CLASS_LINKSET.getEntityURI()));
String linksetId = subId + "_" + pred.replaceAll("[^a-zA-Z0-9]", "_") + "_" + objId;
Resource linkset = model.createResource(new AnonId(linksetId)).addProperty(this.type, model.createResource(VoidVocabulary.CLASS_LINKSET.getEntityURI()));
linkset.addProperty(this.linkPredicateProp, model.createResource(pred));
linkset.addProperty(this.subjectsTargetProp, subResNode);
linkset.addProperty(this.objectsTargetProp, objResNode);
Expand All @@ -255,9 +270,10 @@ private void addLinksetToModel(String sub, String pred, String obj) {

/**
* Extracts the types and creates the linksets for a dataset.
* @throws FileNotFoundException
* @throws Exception
*/
public void extract() throws Exception {
public void extract() throws FileNotFoundException {
logger.info("Total number of types in the dataset: " + this.getTypesNumber() + ". Process starting at " + this.offset);
logger.info("A temp file will be written every " + LinksetsExtraction.LIMIT + " types");
QueryEngineHTTP httpQuery = this.retrieveAllTypes();
Expand Down
11 changes: 8 additions & 3 deletions src/main/java/uk/ac/ebi/rdf/linksets/LinksetsGeneration.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,16 @@ public static void main(String[] args) throws Exception {
+ "-w <working directory> directory where RDF output will be saved\n"
+ "-ep <endpoint URI> endpoint for which you are generating linksets\n"
+ "-ds <dataset URI> dataset URI\n"
+ "-o <number> offset to process types (in case process drops in the middle)";
+ "-o <number> offset to process types (in case process drops in the middle)\n"
+ "-l <number> limit to process types from offset, if offset is 0 limit will be discarded.";
if (args == null) {
System.out.println(usage);
System.exit(0);
}
PropertyConfigurator.configure("log4j.properties");

String runningMode = null, workingDir = null, endpoint = null, dataset = null;
int offset = 0;
int offset = 0, limit = 0;
//boolean addIssuedDate = false;
for (int i = 0; i < args.length; i++) {
String str = args[i];
Expand All @@ -57,6 +58,10 @@ public static void main(String[] args) throws Exception {
try {
offset = Integer.parseInt(args[++i]);
} catch (NumberFormatException nfe) {}
} else if (str.equalsIgnoreCase("-l")) {
try {
limit = Integer.parseInt(args[++i]);
} catch (NumberFormatException nfe) {}
}
}

Expand All @@ -82,7 +87,7 @@ public static void main(String[] args) throws Exception {
if ((endpoint != null) && (dataset != null)) {
outputFileName = workingDir + "/" + ModeAndFile.EXTRACT.getFileName();
try {
LinksetsExtraction linksets = new LinksetsExtraction(dataset, endpoint, outputFileName, offset);
LinksetsExtraction linksets = new LinksetsExtraction(dataset, endpoint, outputFileName, offset, limit);
linksets.extract();
} catch (IOException e) {
logger.error("Unxpected error while extracting classes and properties from " + endpoint + "\nCheck the output file at " + outputFileName);
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/config.properties
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
###Configuration properties

#Comma-separated list of types that should be excluded from the solution. Each entry is a prefix (starts-with string) of the type URIs to exclude
types.exclude=http://www.ifomis.org/,http://www.w3.org/2002/,http://www.w3.org/ns/sparql-service-description/,http://rdfs.org/ns/void/
types.exclude=http://www.ifomis.org/,http://www.w3.org/2002/,http://www.w3.org/ns/sparql-service-description,http://rdfs.org/ns/void

#Types that should be merged into one in the form of regex, matched fields.
#The replacement might include portions of the regex evaluation --$0,$1,and so on
Expand Down

0 comments on commit 089f1ec

Please sign in to comment.