package org.aksw.gerbil.tools;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.semantic.sameas.index.Indexer;
import org.apache.commons.lang.time.DurationFormatUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.hp.hpl.jena.query.Query;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.rdf.model.RDFNode;
public class InitialIndexTool {
private static final Logger LOGGER = LoggerFactory
.getLogger(InitialIndexTool.class);
private static final String OUTPUT_FOLDER = "lucene_index";
private static final String SPARQL_GET = "select distinct ?s ?o where {?s <http://www.w3.org/2002/07/owl#sameAs> ?o}";
private static String service = "http://de.dbpedia.org/sparql";
private static Object owlSameAs="<http://www.w3.org/2002/07/owl#sameAs>";
public static void main(String[] args) throws GerbilException, IOException {
Indexer index = new Indexer(OUTPUT_FOLDER);
SimpleDateFormat format = new SimpleDateFormat();
Date start = Calendar.getInstance().getTime();
LOGGER.info("Start indexing at {}", format.format(start));
indexFolder(index, args[0]);
index.close();
Date end = Calendar.getInstance().getTime();
LOGGER.info("Indexing finished at {}", format.format(end));
LOGGER.info("Indexing took: "
+ DurationFormatUtils.formatDurationHMS(end.getTime()
- start.getTime()));
}
public static void index(Indexer index) throws GerbilException {
int offset = 0, limit = 10000;
boolean test = true;
Query q = QueryFactory.create(SPARQL_GET);
q.setLimit(limit);
// Create here!
Set<String> sameAsBlock = new HashSet<String>();
RDFNode old = null;
int rounds = 0, size = 0;
long total = 0;
Date start = Calendar.getInstance().getTime();
do {
q.setOffset(offset);
Date startQ = Calendar.getInstance().getTime();
QueryExecution qexec = QueryExecutionFactory.sparqlService(service,
q);
ResultSet res = qexec.execSelect();
Date endQ = Calendar.getInstance().getTime();
// get results
size = 0;
long sumI = 0;
rounds++;
// Go through all elements
while (res.hasNext()) {
size++;
QuerySolution solution = res.next();
RDFNode node1 = solution.get("s");
RDFNode node2 = solution.get("o");
if (node1.equals(old)) {
sameAsBlock.add(node2.toString());
} else if (old != null) {
// Enitity is finished
Date startI = Calendar.getInstance().getTime();
index.index(old.toString(), sameAsBlock);
Date endI = Calendar.getInstance().getTime();
sumI += endI.getTime() - startI.getTime();
total += sameAsBlock.size();
sameAsBlock.clear();
// Add Uri
sameAsBlock.add(node2.toString());
old = node1;
} else {
// First run
sameAsBlock.add(node2.toString());
old = node1;
}
}
if (size < limit) {
// No more results
test = false;
}
// Set offset so it starts immediately after last results
offset += limit;
Date end = Calendar.getInstance().getTime();
String avg = DurationFormatUtils
.formatDurationHMS((end.getTime() - start.getTime())
/ rounds);
String avgQ = DurationFormatUtils
.formatDurationHMS((endQ.getTime() - startQ.getTime()));
String avgI = DurationFormatUtils.formatDurationHMS(sumI);
sumI = 0;
LOGGER.info(
"Got {} triples...(Sum: {}, AvgTime: {}, QueryTime: {}, IndexTime: {})",
size, limit * (rounds - 1) + size, avg, avgQ, avgI);
} while (test);
// done
if (!sameAsBlock.isEmpty()) {
index.index(old.toString(), sameAsBlock);
sameAsBlock.clear();
}
LOGGER.info("Successfully indexed {} triples", total);
}
public static void indexFolder(Indexer index, String folder) throws GerbilException, IOException{
File dir = new File(folder);
for(File f : dir.listFiles()){
if(f.getName().endsWith(".nt"))
index(index, f.getAbsolutePath());
}
}
public static void index(Indexer index, String file)
throws GerbilException, IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream(file), Charset.forName("UTF-8")));
// Create here!
Set<String> sameAsBlock = new HashSet<String>();
long total = 0, size=0, rounds=0;
String line = "";
String old = null;
Date start = Calendar.getInstance().getTime();
while ((line = reader.readLine()) != null) {
String[] split = line.split("\\s+");
if(!split[1].equals(owlSameAs)) {
continue;
}
String node1 = split[0].replace("<", "").replace(">", "");
String node2 = split[2];
node2 = node2.substring(node2.indexOf("<")+1, node2.lastIndexOf(">")).trim();
if (node1.equals(old)) {
sameAsBlock.add(node2.toString());
} else if (old != null) {
// Enitity is finished
index.index(old.toString(), sameAsBlock);
total += sameAsBlock.size();
sameAsBlock.clear();
// Add Uri
sameAsBlock.add(node2.toString());
old = node1;
} else {
// First run
sameAsBlock.add(node2.toString());
old = node1;
}
size++;
if(size%100000==0){
Date end = Calendar.getInstance().getTime();
rounds++;
String avgTime =DurationFormatUtils.formatDurationHMS((end.getTime()
- start.getTime())/rounds);
LOGGER.info("Got 100000 triples...(Sum: {}, AvgTime: {})", size, avgTime);
}
}
// done
if (!sameAsBlock.isEmpty()) {
index.index(old.toString(), sameAsBlock);
sameAsBlock.clear();
}
reader.close();
LOGGER.info("Successfully indexed {} triples", total);
}
}