package org.aksw.gerbil.dataset.impl.erd;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import com.hp.hpl.jena.query.Query;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
public class ERDDataset2 extends AbstractDataset implements
InitializableDataset {
private List<Document> documents;
private String annotateFile;
private String textFile;
private String queryTemp = "PREFIX owl:<http://www.w3.org/2002/07/owl#> PREFIX freebase:<http://rdf.freebase.com/ns/> SELECT ?s WHERE {?s owl:sameAs freebase:%%v%%}";
private static final String DBPEDIA_SERVICE = "http://dbpedia.org/sparql";
public ERDDataset2(String textFile, String annotateFile) {
this.annotateFile = annotateFile;
this.textFile = textFile;
}
@Override
public int size() {
return documents.size();
}
@Override
public List<Document> getInstances() {
return documents;
}
@Override
public void init() throws GerbilException {
this.documents = loadDocuments(new File(annotateFile), new File(
textFile));
}
private List<Document> loadDocuments(File annFile, File textFile) throws GerbilException {
List<Document> documents = new ArrayList<Document>();
String documentUriPrefix = "http://" + getName() + "/";
try (BufferedReader breader = new BufferedReader(new InputStreamReader(
new FileInputStream(textFile), Charset.forName("UTF-8")))) {
String line;
List<Marking> markings = null;
while ((line = breader.readLine()) != null) {
if(line.isEmpty()){
continue;
}
String[] text = line.split("\t");
markings = findMarkings(text, annFile);
documents.add(new DocumentImpl(text[1], documentUriPrefix
+ text[0], markings));
}
} catch (IOException e) {
throw new GerbilException("Exception while reading dataset.", e,
ErrorTypes.DATASET_LOADING_ERROR);
}
return documents;
}
private List<Marking> findMarkings(String[] text, File annFile) throws GerbilException {
List<Marking> markings = new ArrayList<Marking>();
try (BufferedReader breader = new BufferedReader(new InputStreamReader(
new FileInputStream(annFile), Charset.forName("UTF-8")))) {
String line;
while ((line = breader.readLine()) != null) {
if(line.isEmpty()){
continue;
}
String[] annotation = line.split("\t");
int searchID = getTrecID(text[0]);
int annoID = getTrecID(annotation[0]);
if(searchID == annoID){
int start = text[1].indexOf(annotation[3]);
int length = annotation[3].length();
//FIXME time consuming!
String freebaseID = annotation[2].substring(1, annotation[2].length()).replace("/",".");
Query query = QueryFactory.create(queryTemp.replace("%%v%%", freebaseID));
QueryExecution qexec = QueryExecutionFactory.createServiceRequest(DBPEDIA_SERVICE, query);
String uri = qexec.execSelect().next().getResource("s").getURI();
markings.add(new NamedEntity(start, length, uri));
}
else if(annoID > searchID){
//There is no annotation for the given text
break;
}
}
} catch (IOException e) {
throw new GerbilException("Exception while reading dataset.", e,
ErrorTypes.DATASET_LOADING_ERROR);
}
return markings;
}
private int getTrecID(String trec){
return Integer.valueOf(trec.replace("TREC-", ""));
}
}