/**
*
*/
package com.maalaang.omtwitter.ontology;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.log4j.Logger;
import com.hp.hpl.jena.query.Query;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
/**
 * Collects data from the DBPedia SPARQL end-point for a set of seed resources:
 * counts predicate-object patterns of their statements, expands the resource set
 * with other subjects sharing frequent patterns, and retrieves the statements of
 * the resources into a Jena {@link Model}.
 *
 * @author Sangwon Park
 *
 */
public class DBPediaDomainOntologyBuilder {

    /** Literal objects are truncated to this many characters when used as a pattern key. */
    private static final int LITERAL_KEY_MAX_LEN = 20;

    private final Logger logger;

    public DBPediaDomainOntologyBuilder() {
        logger = Logger.getLogger(getClass().getName());
    }

    /**
     * Collects statements about the resources and calculates the frequency of each predicate-object pattern.
     * A pattern key has the form {@code "<predicate URI> <object key>"}, where the object key is the URI
     * of a resource object, or the whitespace-normalized lexical value of a literal truncated to
     * {@value #LITERAL_KEY_MAX_LEN} characters.
     *
     * @param resources a set of resources to find predicate-object patterns of their statements
     * @param interval milliseconds for the time interval between queries to DBPedia; no wait if not positive
     * @return a map for predicate-object patterns and their frequencies
     * @throws InterruptedException if the thread is interrupted while waiting between queries
     */
    public Map<String,Integer> stmtPatternFrequency(Set<String> resources, int interval) throws InterruptedException {
        Map<String,Integer> map = new HashMap<String,Integer>();

        for (String id : resources) {
            String q = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " +
                    "SELECT ?p ?o WHERE {" +
                    " <" + id + "> ?p ?o " +
                    "}";
            logger.info("find statement about <" + id + "> - query to dbpedia: " + q);

            Query query = QueryFactory.create(q);
            QueryExecution qexec = QueryExecutionFactory.sparqlService(DBPediaConstant.DBPEDIA_SPARQL_ENDPOINT, query);
            try {
                ResultSet results = qexec.execSelect();
                while (results.hasNext()) {
                    // A single bad solution (e.g. an ill-formed literal) must not abort the whole scan.
                    try {
                        QuerySolution qs = results.next();
                        Resource p = qs.getResource("p");
                        String stmt = p.getURI() + " " + objectKey(qs.get("o"));
                        logger.debug(stmt);

                        Integer freq = map.get(stmt);
                        map.put(stmt, freq == null ? 1 : freq + 1);
                    } catch (Exception e) {
                        logger.error("failed to process a statement about <" + id + ">", e);
                    }
                }
            } finally {
                // Release the execution even when the query or the iteration fails.
                qexec.close();
            }

            pause(interval);
        }
        return map;
    }

    /**
     * Find statements matched with the predicate-object pattern by querying to DBPedia SparQL end-point and add them to the resource set.
     * Only patterns whose frequency reaches {@code ceil(resources.size() * paramSeedCommonness)} (computed once,
     * before any resource is added) and whose object starts with the DBPedia resource URI prefix are used.
     *
     * @param stmtPatternFreqMap pattern keys of the form {@code "<predicate URI> <object key>"} and their frequencies
     * @param resources the resource set; subjects matching a selected pattern are added to it
     * @param paramSeedCommonness ratio of the resource set size used as the pattern frequency threshold
     * @param targetProperties if not {@code null}, only patterns whose predicate is in this set are used
     * @param interval milliseconds to wait after each query to DBPedia; no wait if not positive
     * @throws InterruptedException if the thread is interrupted while waiting between queries
     */
    public void expandResources(Map<String,Integer> stmtPatternFreqMap, Set<String> resources, double paramSeedCommonness, Set<String> targetProperties, int interval) throws InterruptedException {
        int threshold = (int) Math.ceil(resources.size() * paramSeedCommonness);
        logger.info("expand resources - pattern_frequency_threshold=" + threshold);

        for (Entry<String,Integer> entry : stmtPatternFreqMap.entrySet()) {
            Integer freq = entry.getValue();
            if (freq == null || freq < threshold) {
                continue;
            }

            String stmtStr = entry.getKey();
            // URIs contain no spaces, so the first space separates the predicate from the object key.
            int idx = stmtStr == null ? -1 : stmtStr.indexOf(' ');
            if (idx < 0) {
                logger.warn("skip malformed statement pattern: " + stmtStr);
                continue;
            }
            String prop = stmtStr.substring(0, idx);
            String obj = stmtStr.substring(idx + 1);

            if (!obj.startsWith(DBPediaConstant.DBPEDIA_RESOURCE_URI_PREFIX)) {
                continue;
            }
            if (targetProperties != null && !targetProperties.contains(prop)) {
                continue;
            }

            addMatchedResources(resources, prop, obj);
            pause(interval);
        }
        logger.info("total " + resources.size() + " resources in the resource set");
    }

    /**
     * Queries DBPedia for every subject having the statement {@code ?s <p> <o>} and adds the
     * URI of each subject to the resource set.
     *
     * @param resources the set that receives the subject URIs
     * @param p predicate URI of the pattern
     * @param o object URI of the pattern
     */
    private void addMatchedResources(Set<String> resources, String p, String o) {
        logger.info("find resources matched with the pattern - <s> < " + p + "> <" + o + ">");
        String q = "SELECT ?s WHERE { " +
                " ?s <" + p + "> <" + o + "> " +
                "}";
        logger.info("query to dbpedia: " + q);

        Query query = QueryFactory.create(q);
        QueryExecution qexec = QueryExecutionFactory.sparqlService(DBPediaConstant.DBPEDIA_SPARQL_ENDPOINT, query);
        int cnt = 0;
        try {
            ResultSet results = qexec.execSelect();
            while (results.hasNext()) {
                RDFNode s = results.next().get("s");
                if (s != null && s.isResource()) {
                    String uri = s.asResource().getURI();
                    // Blank nodes have no URI; do not put null into the resource set.
                    if (uri != null) {
                        resources.add(uri);
                        cnt++;
                    }
                }
            }
        } finally {
            qexec.close();
        }
        logger.info(cnt + " matched resources are added");
    }

    /**
     * Retrieves all statements whose subject is one of the given resources from DBPedia
     * and collects them into a new in-memory model.
     * A query whose execution fails is retried, after waiting for the interval, until it succeeds;
     * the number of retries is not bounded.
     *
     * @param resources URIs of the resources to retrieve statements for
     * @param interval milliseconds to wait between queries and before a retry; no wait if not positive
     * @return a model containing the retrieved statements
     * @throws IOException declared for compatibility with existing callers
     * @throws InterruptedException if the thread is interrupted while waiting between queries
     */
    public Model retrieveStmtsForResources(Set<String> resources, int interval) throws IOException, InterruptedException {
        Model model = ModelFactory.createDefaultModel();
        int totalCnt = 0;

        for (String uri : resources) {
            Resource s = model.createResource(uri);
            String q = "SELECT ?p ?o WHERE { " +
                    " <" + uri + "> ?p ?o . " +
                    "}";
            logger.info("query to dbpedia - " + q);
            Query query = QueryFactory.create(q);

            QueryExecution qexec = null;
            ResultSet results = null;
            while (results == null) {
                // A query execution represents a single run, so each attempt gets a fresh one.
                qexec = QueryExecutionFactory.sparqlService(DBPediaConstant.DBPEDIA_SPARQL_ENDPOINT, query);
                try {
                    results = qexec.execSelect();
                } catch (Exception e) {
                    qexec.close();
                    logger.error("query to dbpedia failed for <" + uri + ">, retrying", e);
                    pause(interval);
                }
            }

            int cnt = 0;
            try {
                while (results.hasNext()) {
                    QuerySolution qs = results.next();
                    Resource propRes = qs.getResource("p");
                    Property prop = model.createProperty(propRes.getURI());
                    model.add(s, prop, qs.get("o"));
                    cnt++;
                }
            } finally {
                qexec.close();
            }

            totalCnt += cnt;
            logger.info(cnt + " triples were added");
            pause(interval);
        }
        logger.info("total " + totalCnt + " triples were added");
        return model;
    }

    /**
     * Builds the object part of a pattern key: the URI for a resource, or the literal value
     * with whitespace runs collapsed to one space and cut to {@value #LITERAL_KEY_MAX_LEN} characters.
     */
    private String objectKey(RDFNode o) {
        if (o.isResource()) {
            return o.asResource().getURI();
        }
        String key = o.asLiteral().getValue().toString().replaceAll("\\s+", " ");
        if (key.length() > LITERAL_KEY_MAX_LEN) {
            key = key.substring(0, LITERAL_KEY_MAX_LEN);
        }
        return key;
    }

    /**
     * Waits for the given number of milliseconds; does nothing when the interval is not positive
     * (a negative value would make {@link Thread#sleep(long)} throw).
     */
    private void pause(int interval) throws InterruptedException {
        if (interval > 0) {
            logger.info("wait for " + (interval / 1000) + " seconds");
            Thread.sleep(interval);
        }
    }
}