package doser.tools.indexcreation;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.jgrapht.UndirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.graph.SimpleGraph;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdtjena.HDTGraph;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import com.hp.hpl.jena.query.QueryException;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.QueryParseException;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import doser.lucene.analysis.DoserIDAnalyzer;
public class CreateDBpediaIndexV2 {
// ---------------------------------------------------------------------------
// Hard-coded absolute file-system locations used by this tool. They are only
// valid on the machine the tool was written for. Not every constant is
// referenced in the part of the class visible here.
// ---------------------------------------------------------------------------
public static final String SURFACEFORMDIRECTORY = "/home/zwicklbauer/surfaceforms";
public static final String SURFACEFORMDIRECTORYN3 = "/home/zwicklbauer/surfaceforms/n3/";
// Existing Lucene index read by getUniqueLabelsFromOldIndex().
public static final String OLDINDEX = "/mnt/ssd1/disambiguation/MMapLuceneIndexStandard/";
// Directory of the Lucene index written by createNewIndex().
public static final String NEWINDEX = "/home/zwicklbauer/NewIndexTryout";
// Entity list read by readEntities().
public static final String ENTITYLIST = "/home/zwicklbauer/WikipediaEntities/entityList_Default_HoffartNew.dat";
// RDF dump read by fillRelationsIndex() and createDBpediaPriors().
public static final String MAPPINGPROPERTIES = "/home/zwicklbauer/HDTGeneration/mappingbased_properties_cleaned_en.nt";
// HDT file backing the persondata model (mapped in the constructor).
public static final String PERSONDATAHDT = "/mnt/ssd1/disambiguation/HDT/persondata_en.hdt";
// RDF dump read by fillPropertiesIndex() and createDBpediaPriors().
public static final String INFOBOXPROPERTIES = "/home/zwicklbauer/HDTGeneration/infobox_properties_en.nt";
public static final String DISAMBIGUATIONWIKILINKS = "/home/zwicklbauer/HDTGeneration/disambiguations_en.nt";
public static final String PATTYWIKIPATTERN = "/home/zwicklbauer/Patty/patty-dataset-WikiTypes/wikipedia-patterns.txt";
public static final String PATTYWIKIINSTANCE = "/home/zwicklbauer/Patty/patty-dataset-WikiTypes/wikipedia-instances.txt";
public static final String PATTYFREEBASEPATTERN = "/home/zwicklbauer/Patty/patty-dataset-freebase/wikipedia-patterns.txt";
public static final String PATTYFREEBASEINSTANCE = "/home/zwicklbauer/Patty/patty-dataset-freebase/wikipedia-instances.txt";
// Tab-separated file read by loadEvidences().
public static final String EVIDENCEFILE = "/home/zwicklbauer/word2vec/evidences.dat";
public static final String WEBOCCURRENCESDIRECTORY = "/home/zwicklbauer/WikipediaEntities/EntitiesWebContext/";
// Tab-separated file read by workLinkText().
public static final String LINKTEXT = "/home/zwicklbauer/WikipediaEntities/enwiki-latest/linktext";
// Tab-separated file read by workEntities().
public static final String ENTITIES = "/home/zwicklbauer/WikipediaEntities/entities_StandardParse_threshold12";
// Tab-separated file read by workRedirects().
public static final String REDIRECTS = "/home/zwicklbauer/WikipediaEntities/enwiki-latest/redirects";
// RDF dump read by createDBpediaPriors().
public static final String ARTICLECATEGORIES = "/home/zwicklbauer/HDTGeneration/article_categories_en.nt";
// HDT files backing the label / short description / long description models (mapped in the constructor).
public static final String LABELHDT = "/home/zwicklbauer/WikipediaIndexGeneration/rdffiles/labels_en.hdt";
public static final String SHORTDESCHDT = "/home/zwicklbauer/WikipediaIndexGeneration/rdffiles/short_abstracts_en.hdt";
public static final String LONGDESCHDT = "/home/zwicklbauer/WikipediaIndexGeneration/rdffiles/long_abstracts_en.hdt";
// HDT file backing the instancemappingtypes model (mapped in the constructor).
public static final String INSTANCEMAPPINGTYPES = "/mnt/ssd1/disambiguation/HDT/instance_types_en.hdt";
public static final String INSTANCEMAPPINGTYPES_NT = "/mnt/ssd1/disambiguation/HDT/instance_types_en.nt";
// RDF dump read by createDBpediaPriors().
public static final String SKOSBROADER = "/home/zwicklbauer/HDTGeneration/skos_categories_en.nt";
public static final String EXTERNSFDIRECTORY = "/home/zwicklbauer/SurfaceForms/";
// ---------------------------------------------------------------------------
// In-memory state collected by the load/fill/work methods and consumed by
// createNewIndex().
// ---------------------------------------------------------------------------
// Entity URI -> label strings (filled by workEntities() and workRedirects()).
private HashMap<String, HashSet<String>> LABELS;
// Entity URIs for which createNewIndex() builds a document (filled by readEntities()).
private HashSet<String> entities;
// Subject URI -> "predicate:::object" facts (filled by fillPropertiesIndex() and fillRelationsIndex()).
private HashMap<String, LinkedList<String>> relationmap;
// Subject URI -> "patty/<pattern>:::dbpediaRes/<object>" facts (filled by fillPattyRelationIndex()).
private HashMap<String, LinkedList<String>> pattymap;
// Same layout as pattymap (filled by fillPattyFreebaseRelationIndex()).
private HashMap<String, LinkedList<String>> pattyfreebasemap;
// First column -> second column of the evidence file (filled by loadEvidences()).
private HashMap<String, String> evidences;
// URIs for which createNewIndex() adds extracted sports team names.
// NOTE(review): where this set is populated is not visible in this part of the file.
private HashSet<String> teams;
// Entity URI -> unique label candidates (filled by workLinkText(), getUniqueLabelsFromOldIndex()
// and addLabelToUniqueLabel()).
private HashMap<String, HashSet<String>> UNIQUELABELSTRINGS;
// Entity URI -> (surface form -> count); written to the "Occurrences" field by createNewIndex().
private HashMap<String, HashMap<String, Integer>> OCCURRENCES;
// URI -> vertex degree in the undirected graph built by createDBpediaPriors().
private HashMap<String, Integer> DBPEDIAGRAPHINLINKS;
// NOTE(review): not used in the visible part of the class.
private HashMap<String, String> urlentitymapping;
// Running number used for the "ID" field in createNewIndex(); also reset by createDBpediaPriors().
private int counter;
// Jena models created in the constructor on top of the HDT files above. They stay null when
// mapping an HDT file fails.
private Model labelmodel;
private Model shortdescmodel;
private Model longdescmodel;
private Model persondata;
private Model instancemappingtypes;
/**
 * Creates the index builder: allocates all (empty) in-memory lookup tables and
 * maps the five HDT files (labels, short descriptions, long descriptions,
 * person data, instance types) into Jena models.
 *
 * An IOException raised while mapping any HDT file is only printed; in that
 * case none of the model fields is assigned.
 */
public CreateDBpediaIndexV2() {
    super();
    this.counter = 0;
    // Plain lookup sets.
    this.entities = new HashSet<String>();
    this.teams = new HashSet<String>();
    // Fact tables.
    this.relationmap = new HashMap<String, LinkedList<String>>();
    this.pattymap = new HashMap<String, LinkedList<String>>();
    this.pattyfreebasemap = new HashMap<String, LinkedList<String>>();
    // Label / occurrence tables.
    this.LABELS = new HashMap<String, HashSet<String>>();
    this.UNIQUELABELSTRINGS = new HashMap<String, HashSet<String>>();
    this.OCCURRENCES = new HashMap<String, HashMap<String, Integer>>();
    this.DBPEDIAGRAPHINLINKS = new HashMap<String, Integer>();
    // Miscellaneous string mappings.
    this.evidences = new HashMap<String, String>();
    this.urlentitymapping = new HashMap<String, String>();
    try {
        // Step 1: map every HDT file. Any failure aborts before a model is assigned.
        final HDT labels = HDTManager.mapIndexedHDT(LABELHDT, null);
        final HDT shortDescriptions = HDTManager.mapIndexedHDT(SHORTDESCHDT, null);
        final HDT longDescriptions = HDTManager.mapIndexedHDT(LONGDESCHDT, null);
        final HDT personData = HDTManager.mapIndexedHDT(PERSONDATAHDT, null);
        final HDT instanceTypes = HDTManager.mapIndexedHDT(INSTANCEMAPPINGTYPES, null);
        // Step 2: wrap each HDT in a Jena graph.
        final HDTGraph labelGraph = new HDTGraph(labels);
        final HDTGraph shortDescriptionGraph = new HDTGraph(shortDescriptions);
        final HDTGraph longDescriptionGraph = new HDTGraph(longDescriptions);
        final HDTGraph personDataGraph = new HDTGraph(personData);
        final HDTGraph instanceTypesGraph = new HDTGraph(instanceTypes);
        // Step 3: expose the graphs as models.
        this.labelmodel = ModelFactory.createModelForGraph(labelGraph);
        this.shortdescmodel = ModelFactory.createModelForGraph(shortDescriptionGraph);
        this.longdescmodel = ModelFactory.createModelForGraph(longDescriptionGraph);
        this.persondata = ModelFactory.createModelForGraph(personDataGraph);
        this.instancemappingtypes = ModelFactory.createModelForGraph(instanceTypesGraph);
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
}
/**
 * Loads the evidence file ({@link #EVIDENCEFILE}) into {@code evidences}.
 *
 * Every line is split at tab characters; the first column becomes the key and
 * the second column the value. Lines with fewer than two columns are skipped
 * (previously such a line aborted the whole load with an
 * ArrayIndexOutOfBoundsException). I/O errors are printed and otherwise ignored.
 */
public void loadEvidences() {
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(new File(EVIDENCEFILE)));
        String line;
        while ((line = reader.readLine()) != null) {
            final String[] columns = line.split("\\t");
            if (columns.length < 2) {
                // Blank or malformed line: nothing to store.
                continue;
            }
            this.evidences.put(columns[0], columns[1]);
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Builds an undirected graph from the resource-to-resource statements of four
 * RDF dumps ({@link #INFOBOXPROPERTIES}, {@link #MAPPINGPROPERTIES},
 * {@link #SKOSBROADER}, {@link #ARTICLECATEGORIES}) and stores the degree of
 * every vertex in {@code DBPEDIAGRAPHINLINKS}.
 *
 * Side effect kept from the previous implementation: {@code counter} is 0
 * after this method has run.
 */
public void createDBpediaPriors() {
    final UndirectedGraph<String, DefaultEdge> graph = new SimpleGraph<String, DefaultEdge>(DefaultEdge.class);
    addResourceLinks(graph, INFOBOXPROPERTIES);
    addResourceLinks(graph, MAPPINGPROPERTIES);
    addResourceLinks(graph, SKOSBROADER);
    addResourceLinks(graph, ARTICLECATEGORIES);
    // The former code used the shared counter as a scratch statement counter and then reset it;
    // only the reset is observable, so only the reset is kept.
    counter = 0;
    for (String vertex : graph.vertexSet()) {
        DBPEDIAGRAPHINLINKS.put(vertex, graph.degreeOf(vertex));
    }
}

/**
 * Reads one RDF file into a fresh in-memory model and adds an edge
 * subject -- object for every statement whose object is a resource in the
 * {@code http://dbpedia.org/resource/} namespace.
 *
 * @param graph   graph that receives the vertices and edges
 * @param rdfFile location passed to {@code Model.read(String)}
 */
private static void addResourceLinks(final UndirectedGraph<String, DefaultEdge> graph, final String rdfFile) {
    final Model model = ModelFactory.createDefaultModel();
    model.read(rdfFile);
    final StmtIterator it = model.listStatements();
    while (it.hasNext()) {
        final Statement statement = it.next();
        final RDFNode object = statement.getObject();
        if (!object.isResource() || !statement.getPredicate().isResource()) {
            continue;
        }
        final String subjectUri = statement.getSubject().getURI();
        final String objectUri = object.asResource().getURI();
        if (subjectUri == null || objectUri == null) {
            // Blank nodes have no URI; they cannot be DBpedia resources.
            continue;
        }
        if (!objectUri.startsWith("http://dbpedia.org/resource/")) {
            continue;
        }
        if (subjectUri.equalsIgnoreCase(objectUri)) {
            // Links from an entity to itself are ignored (same check as before, case-insensitive).
            continue;
        }
        graph.addVertex(subjectUri);
        graph.addVertex(objectUri);
        graph.addEdge(subjectUri, objectUri);
    }
}
/**
 * Reads {@link #INFOBOXPROPERTIES} and appends, for every statement whose
 * object is a resource in the {@code http://dbpedia.org/resource/} namespace,
 * the string {@code <predicate>:::<object>} to the fact list of the subject in
 * {@code relationmap}. The property and resource namespace prefixes are
 * replaced by {@code dbpediaOnt/} and {@code dbpediaRes/}.
 */
public void fillPropertiesIndex() {
    final Model infobox = ModelFactory.createDefaultModel();
    infobox.read(INFOBOXPROPERTIES);
    for (StmtIterator statements = infobox.listStatements(); statements.hasNext();) {
        final Statement statement = statements.next();
        final Resource source = statement.getSubject();
        final Property predicate = statement.getPredicate();
        final RDFNode target = statement.getObject();
        if (!target.isResource()) {
            continue;
        }
        final Resource targetResource = target.asResource();
        if (!predicate.isResource() || !targetResource.getURI().startsWith("http://dbpedia.org/resource/")) {
            continue;
        }
        // Fetch the subject's fact list, creating it on first use.
        LinkedList<String> facts = relationmap.get(source.getURI());
        if (facts == null) {
            facts = new LinkedList<String>();
            relationmap.put(source.getURI(), facts);
        }
        final String shortPredicate = predicate.getURI().replaceAll("http://dbpedia.org/property/", "dbpediaOnt/");
        final String shortObject = targetResource.getURI().replaceAll("http://dbpedia.org/resource/", "dbpediaRes/");
        facts.add(shortPredicate + ":::" + shortObject);
    }
}
/**
 * Reads {@link #MAPPINGPROPERTIES} and appends, for every statement whose
 * object is a resource in the {@code http://dbpedia.org/resource/} namespace,
 * the string {@code <predicate>:::<object>} to the fact list of the subject in
 * {@code relationmap}. The ontology and resource namespace prefixes are
 * replaced by {@code dbpediaOnt/} and {@code dbpediaRes/}.
 */
public void fillRelationsIndex() {
    final Model mappings = ModelFactory.createDefaultModel();
    mappings.read(MAPPINGPROPERTIES);
    for (StmtIterator statements = mappings.listStatements(); statements.hasNext();) {
        final Statement statement = statements.next();
        final Resource source = statement.getSubject();
        final Property predicate = statement.getPredicate();
        final RDFNode target = statement.getObject();
        if (!target.isResource()) {
            continue;
        }
        final Resource targetResource = target.asResource();
        if (!predicate.isResource() || !targetResource.getURI().startsWith("http://dbpedia.org/resource/")) {
            continue;
        }
        // Fetch the subject's fact list, creating it on first use.
        LinkedList<String> facts = relationmap.get(source.getURI());
        if (facts == null) {
            facts = new LinkedList<String>();
            relationmap.put(source.getURI(), facts);
        }
        final String shortPredicate = predicate.getURI().replaceAll("http://dbpedia.org/ontology/", "dbpediaOnt/");
        final String shortObject = targetResource.getURI().replaceAll("http://dbpedia.org/resource/", "dbpediaRes/");
        facts.add(shortPredicate + ":::" + shortObject);
    }
}
/**
 * Loads PATTY facts into {@code pattymap}.
 *
 * @param pattern  path of the pattern file (tab separated: id, pattern text; first line is skipped)
 * @param instance path of the instance file (tab separated: pattern id, subject, object; first line is skipped)
 */
public void fillPattyRelationIndex(String pattern, String instance) {
    readPattyInstances(instance, readPattyPatterns(pattern), pattymap);
}

/**
 * Loads PATTY facts into {@code pattyfreebasemap}. Same file layout as
 * {@link #fillPattyRelationIndex(String, String)}.
 *
 * @param pattern  path of the pattern file
 * @param instance path of the instance file
 */
public void fillPattyFreebaseRelationIndex(String pattern, String instance) {
    readPattyInstances(instance, readPattyPatterns(pattern), pattyfreebasemap);
}

/**
 * Reads the pattern file into a map id -> pattern text. Lines with fewer than
 * two columns or a non-numeric id are skipped. I/O errors (including a missing
 * file) are printed and yield the entries read so far.
 */
private static HashMap<Integer, String> readPattyPatterns(String pattern) {
    final HashMap<Integer, String> patternMap = new HashMap<Integer, String>();
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(new File(pattern)));
        reader.readLine(); // skip the first line, as the previous implementation did
        String line;
        while ((line = reader.readLine()) != null) {
            final String[] columns = line.split("\\t");
            if (columns.length < 2) {
                continue;
            }
            final Integer id = parsePattyId(columns[0]);
            if (id != null) {
                patternMap.put(id, columns[1]);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // The reader is null when the file could not be opened; closing it
        // unconditionally used to raise a NullPointerException here.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return patternMap;
}

/**
 * Reads the instance file and appends {@code patty/<pattern>:::dbpediaRes/<object>}
 * to the list stored under the subject URI in {@code target}. Lines with fewer
 * than three columns or a non-numeric pattern id are skipped.
 */
private static void readPattyInstances(String instance, Map<Integer, String> patternMap,
        Map<String, LinkedList<String>> target) {
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(new File(instance)));
        reader.readLine(); // skip the first line, as the previous implementation did
        String line;
        while ((line = reader.readLine()) != null) {
            final String[] columns = line.split("\\t");
            if (columns.length < 3) {
                continue;
            }
            final Integer id = parsePattyId(columns[0]);
            if (id == null) {
                continue;
            }
            final String subject = WikiPediaUriConverter.createConformDBpediaURI(columns[1]);
            final String object = WikiPediaUriConverter.createConformDBpediaURI(columns[2])
                    .replaceAll("http://dbpedia.org/resource/", "");
            LinkedList<String> facts = target.get(subject);
            if (facts == null) {
                facts = new LinkedList<String>();
                target.put(subject, facts);
            }
            facts.add("patty/" + patternMap.get(id) + ":::" + "dbpediaRes/" + object);
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

/** Parses a pattern id; prints the error and returns null when the text is not an integer. */
private static Integer parsePattyId(String text) {
    try {
        return Integer.valueOf(text);
    } catch (NumberFormatException e) {
        e.printStackTrace();
        return null;
    }
}
/**
 * Reads the link text file ({@link #LINKTEXT}) and fills
 * {@code UNIQUELABELSTRINGS} and {@code OCCURRENCES}.
 *
 * Each line is tab separated. Column 0 is the surface form; every column from
 * index 2 on is an entry {@code <entity>:<count>}. The entity itself may
 * contain ':' characters, so only the last ':'-separated token is taken as the
 * count. Entities whose URI contains "(Disambiguation)" are ignored.
 *
 * Entries without a ':' separator or with a non-numeric count are skipped.
 * Previously the former raised a StringIndexOutOfBoundsException and the latter
 * a NumberFormatException, either of which aborted the whole load.
 */
public void workLinkText() {
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(new File(LINKTEXT)));
        String line;
        while ((line = reader.readLine()) != null) {
            final String[] split = line.split("\\t");
            for (int i = 2; i < split.length; ++i) {
                final String[] occ = split[i].split(":");
                if (occ.length < 2) {
                    // No "<entity>:<count>" pair in this column.
                    continue;
                }
                final int count;
                try {
                    count = Integer.parseInt(occ[occ.length - 1]);
                } catch (NumberFormatException e) {
                    Logger.getRootLogger().error("Skipping link text entry with invalid count: " + split[i]);
                    continue;
                }
                // Re-join everything before the count; entity names may contain ':'.
                final StringBuilder entity = new StringBuilder(occ[0]);
                for (int j = 1; j < occ.length - 1; j++) {
                    entity.append(':').append(occ[j]);
                }
                final String uri = WikiPediaUriConverter.createConformDBpediaURI(entity.toString());
                if (uri.contains("(Disambiguation)")) {
                    continue;
                }
                // UniqueLabelStrings
                HashSet<String> set = UNIQUELABELSTRINGS.get(uri);
                if (set == null) {
                    set = new HashSet<String>();
                    UNIQUELABELSTRINGS.put(uri, set);
                }
                set.add(split[0].toLowerCase());
                addUniqueCandidateWithoutSpecialChars(set, split[0]);
                // Occurrences
                if (!OCCURRENCES.containsKey(uri)) {
                    OCCURRENCES.put(uri, new HashMap<String, Integer>());
                }
                addOccurrence(uri, split[0].toLowerCase(), count);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Carries unique labels and occurrence counts over from the old Lucene index
 * ({@link #OLDINDEX}) into {@code UNIQUELABELSTRINGS} and {@code OCCURRENCES}.
 *
 * For every document the "Mainlink" value is stripped of the DBpedia resource
 * prefix, URL-decoded and converted with
 * {@code WikiPediaUriConverter.createConformDBpediaURI}. The stored
 * "Occurrences" value is expected to look like
 * {@code form:::count;;;form:::count}; a missing or non-numeric count falls
 * back to 1.
 *
 * Documents without a "Mainlink" value and empty occurrence elements are
 * skipped. Previously these raised a NullPointerException and an
 * ArrayIndexOutOfBoundsException respectively.
 */
public void getUniqueLabelsFromOldIndex() {
    IndexReader readerOldIndex = null;
    try {
        final Directory oldDir = FSDirectory.open(new File(OLDINDEX));
        readerOldIndex = DirectoryReader.open(oldDir);
        for (int j = 0; j < readerOldIndex.maxDoc(); ++j) {
            final Document oldDoc = readerOldIndex.document(j);
            final String[] oldUniqueLabels = oldDoc.getValues("UniqueLabelString");
            String oldResource = oldDoc.get("Mainlink");
            if (oldResource == null) {
                continue;
            }
            // Transform old to new Namespace
            oldResource = oldResource.replaceAll("http://dbpedia.org/resource/", "");
            oldResource = URLDecoder.decode(oldResource, "UTF-8");
            oldResource = WikiPediaUriConverter.createConformDBpediaURI(oldResource);
            // Old Unique Labels
            HashSet<String> set = UNIQUELABELSTRINGS.get(oldResource);
            if (set == null) {
                set = new HashSet<String>();
                UNIQUELABELSTRINGS.put(oldResource, set);
            }
            if (oldUniqueLabels != null) {
                for (final String oldLabel : oldUniqueLabels) {
                    set.add(oldLabel.toLowerCase());
                    addUniqueCandidateWithoutSpecialChars(set, oldLabel);
                }
            }
            if (!OCCURRENCES.containsKey(oldResource)) {
                OCCURRENCES.put(oldResource, new HashMap<String, Integer>());
            }
            final String oldOccurrences = oldDoc.get("Occurrences");
            if ((oldOccurrences != null) && !oldOccurrences.equalsIgnoreCase("")) {
                for (final String element : oldOccurrences.split(";;;")) {
                    if (element.isEmpty()) {
                        continue;
                    }
                    final String[] parts = element.split(":::");
                    int check = 1;
                    if (parts.length > 1) {
                        try {
                            check = Integer.valueOf(parts[1]);
                        } catch (final NumberFormatException e) {
                            Logger.getRootLogger().error("Warning NumberFormatException while Initialization: ");
                        }
                    }
                    addOccurrence(oldResource, parts[0], check);
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Single close point; the reader used to be closed both here and inside the try block.
        if (readerOldIndex != null) {
            try {
                readerOldIndex.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Reads the entities file ({@link #ENTITIES}) and registers the surface form
 * in column 0 (lower-cased) as a label of every entity listed from column 2
 * onwards. Only the part of each entry before the first ':' is used as the
 * entity name. The label is also offered to the unique-label table through
 * {@code addLabelToUniqueLabel}.
 */
public void workEntities() {
    BufferedReader in = null;
    try {
        in = new BufferedReader(new FileReader(new File(ENTITIES)));
        for (String row = in.readLine(); row != null; row = in.readLine()) {
            final String[] columns = row.split("\\t");
            if (columns.length <= 2) {
                continue;
            }
            for (int col = 2; col < columns.length; ++col) {
                final String[] entry = columns[col].split(":");
                final String uri = WikiPediaUriConverter.createConformDBpediaURI(entry[0]);
                // Synonyms: fetch the label set of the entity, creating it on first use.
                HashSet<String> synonyms = LABELS.get(uri);
                if (synonyms == null) {
                    synonyms = new HashSet<String>();
                    LABELS.put(uri, synonyms);
                }
                synonyms.add(columns[0].toLowerCase());
                // Add Label to UniqueLabel
                addLabelToUniqueLabel(uri, columns[0]);
            }
        }
    } catch (IOException ioe) {
        ioe.printStackTrace();
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
        }
    }
}
/**
 * Adds a label to the unique-label set of an entity, but only when the entity
 * already has such a set; unknown entities are left untouched.
 *
 * @param entity entity URI used as key into {@code UNIQUELABELSTRINGS}
 * @param label  label to add (stored lower-cased, plus the variant produced by
 *               {@code addUniqueCandidateWithoutSpecialChars})
 */
private void addLabelToUniqueLabel(String entity, String label) {
    if (!UNIQUELABELSTRINGS.containsKey(entity)) {
        return;
    }
    final HashSet<String> uniqueLabels = UNIQUELABELSTRINGS.get(entity);
    uniqueLabels.add(label.toLowerCase());
    addUniqueCandidateWithoutSpecialChars(uniqueLabels, label);
}
/**
 * Reads the redirects file ({@link #REDIRECTS}) and registers the redirect
 * source (column 0, lower-cased) as a label of the redirect target (column 1).
 * The label is also offered to the unique-label table through
 * {@code addLabelToUniqueLabel}.
 *
 * Only lines with exactly two tab-separated columns are processed. The former
 * guard ({@code length < 3}) also let through lines with fewer than two
 * columns, which then failed with an ArrayIndexOutOfBoundsException.
 */
public void workRedirects() {
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(new File(REDIRECTS)));
        String line;
        while ((line = reader.readLine()) != null) {
            final String[] split = line.split("\\t");
            // Bug fix of wrong redirects: ignore everything that is not a plain source/target pair.
            if (split.length != 2) {
                continue;
            }
            final String uri = WikiPediaUriConverter.createConformDBpediaURI(split[1]);
            HashSet<String> set = LABELS.get(uri);
            if (set == null) {
                set = new HashSet<String>();
                LABELS.put(uri, set);
            }
            set.add(split[0].toLowerCase());
            // Add Label to UniqueLabel
            addLabelToUniqueLabel(uri, split[0]);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Writes one Lucene document per entry of {@code entities} into the index at
 * {@link #NEWINDEX}, combining the data collected by the other methods:
 * ID, Mainlink, Label/StringLabel, ShortDescription, LongDescription, Type,
 * Occurrences, UniqueLabel, Relations, PattyRelations, PattyFreebaseRelations
 * and (when known) DbpediaVertexDegree.
 *
 * A document is only added when its first "Label" value is non-empty.
 * IOExceptions are printed. The writer is now closed even when building or
 * adding a document fails, so the index is not left with an open writer.
 */
public void createNewIndex() {
    File newIndexFile = new File(NEWINDEX);
    try {
        final Directory newDir = FSDirectory.open(newIndexFile);
        Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
        analyzerPerField.put("Label", new DoserIDAnalyzer());
        analyzerPerField.put("PattyRelations", new DoserIDAnalyzer());
        analyzerPerField.put("PattyFreebaseRelations", new DoserIDAnalyzer());
        analyzerPerField.put("Relations", new DoserIDAnalyzer());
        analyzerPerField.put("Occurrences", new DoserIDAnalyzer());
        analyzerPerField.put("Type", new DoserIDAnalyzer());
        analyzerPerField.put("StringLabel", new DoserIDAnalyzer());
        PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);
        final IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, aWrapper);
        final IndexWriter newIndexWriter = new IndexWriter(newDir, config);
        try {
            for (String uri : entities) {
                Document doc = new Document();
                // Add ID
                doc.add(new StringField("ID", "DBpedia_" + String.valueOf(counter), Store.YES));
                counter++;
                // Add Mainlink
                doc.add(new StringField("Mainlink", uri, Store.YES));
                // Add Labels
                List<String> origLabels = getDbPediaLabel(uri);
                HashSet<String> labelset = LABELS.get(uri);
                if (labelset == null) {
                    labelset = new HashSet<String>();
                }
                for (String s : origLabels) {
                    labelset.add(s);
                }
                for (String s : labelset) {
                    doc.add(new TextField("Label", s.toLowerCase(), Store.YES));
                    doc.add(new StringField("StringLabel", s.toLowerCase(), Store.YES));
                }
                // Add ShortDescriptions
                String shortDescription = getDbPediaShortDescription(uri);
                doc.add(new TextField("ShortDescription", shortDescription, Store.YES));
                // Add longDescriptions
                String longDescription = getDbPediaLongDescription(uri);
                doc.add(new TextField("LongDescription", longDescription, Store.YES));
                // Add Type
                String type = filterStandardDomain(getRDFTypesFromEntity(uri));
                doc.add(new StringField("Type", type, Store.YES));
                // Add Occurrences
                HashMap<String, Integer> occs = OCCURRENCES.get(uri);
                if (uri.equals("http://dbpedia.org/resource/Real_Madrid_C.F.")) {
                    // Hard-coded boost for the surface form "real". The entity may have no
                    // occurrence map yet, which used to cause a NullPointerException here.
                    if (occs == null) {
                        occs = new HashMap<String, Integer>();
                        OCCURRENCES.put(uri, occs);
                    }
                    occs.put("real", 5000);
                }
                StringBuilder builder = new StringBuilder();
                if (occs != null) {
                    for (Map.Entry<String, Integer> entry : occs.entrySet()) {
                        String key = entry.getKey();
                        int value = entry.getValue();
                        builder.append(key + ":::" + String.valueOf(value) + ";;;");
                    }
                }
                String occurrenceString = builder.toString();
                if (occurrenceString.length() > 0) {
                    // Drop the trailing ";;;" separator.
                    occurrenceString = occurrenceString.substring(0, occurrenceString.length() - 3);
                }
                doc.add(new StringField("Occurrences", occurrenceString, Store.YES));
                // UniqueLabelStrings
                HashSet<String> keys = UNIQUELABELSTRINGS.get(uri);
                if (keys == null) {
                    keys = new HashSet<String>();
                }
                // Also add the sports team names
                if (teams.contains(uri)) {
                    keys.addAll(extractSportsTeamNames(labelset, uri));
                }
                // Also add further person names
                // Flip person names: first name <=> last name
                if (type.equalsIgnoreCase("Person")) {
                    keys.addAll(addSomeMorePersonNames(uri));
                }
                keys.addAll(addAdditionalPersonNameOccurrences(uri));
                for (String s : origLabels) {
                    keys.add(s.toLowerCase());
                    addUniqueCandidateWithoutSpecialChars(keys, s);
                }
                for (String s : keys) {
                    doc.add(new StringField("UniqueLabel", s, Store.YES));
                }
                // Add DBPedia Facts
                doc.add(new TextField("Relations", joinFacts(relationmap.get(uri)), Store.YES));
                // Add PattyFacts
                doc.add(new TextField("PattyRelations", joinFacts(pattymap.get(uri)), Store.YES));
                // Add PattyFreebaseFacts
                doc.add(new TextField("PattyFreebaseRelations", joinFacts(pattyfreebasemap.get(uri)), Store.YES));
                // Add DBpediaPriors
                if (DBPEDIAGRAPHINLINKS.containsKey(uri)) {
                    doc.add(new IntField("DbpediaVertexDegree", DBPEDIAGRAPHINLINKS.get(uri), Field.Store.YES));
                }
                // Write Document To Index
                if (doc.get("Label") != null && !doc.get("Label").equalsIgnoreCase("")) {
                    newIndexWriter.addDocument(doc);
                }
            }
        } finally {
            newIndexWriter.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/**
 * Joins fact strings with the ";;;" separator used by the index fields.
 *
 * @param facts facts of one entity; may be null when the entity has none
 * @return the joined string, or "" for null or an empty list
 */
private static String joinFacts(List<String> facts) {
    if (facts == null) {
        return "";
    }
    final StringBuilder joined = new StringBuilder();
    for (String fact : facts) {
        if (joined.length() > 0) {
            joined.append(";;;");
        }
        joined.append(fact);
    }
    return joined.toString();
}
/**
 * Collects additional lower-cased names for an entity.
 *
 * The names come from two sources: all {@code foaf:name} literals found for
 * the URI in the {@code persondata} model, and, when the local part of the URI
 * consists of exactly two '_'-separated tokens, both orderings of those tokens
 * ("a b" and "b a").
 *
 * A QueryParseException is logged and treated as "no names from the model".
 * The query execution is now closed in all cases, also when executing or
 * iterating the query throws.
 *
 * @param uri entity URI
 * @return the collected names (never null)
 */
private HashSet<String> addSomeMorePersonNames(final String uri) {
    HashSet<String> names = new HashSet<String>();
    QueryExecution qexec = null;
    try {
        final String query = "SELECT ?name WHERE{ <" + uri + "> <http://xmlns.com/foaf/0.1/name> ?name. }";
        final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query);
        qexec = QueryExecutionFactory.create(cquery, this.persondata);
        final ResultSet results = qexec.execSelect();
        if (results != null) {
            while (results.hasNext()) {
                final QuerySolution sol = results.nextSolution();
                final String name = sol.getLiteral("name").getLexicalForm();
                names.add(name.toLowerCase());
            }
        }
    } catch (QueryParseException e) {
        Logger.getRootLogger().info("Query parse Exception");
    } finally {
        if (qexec != null) {
            qexec.close();
        }
    }
    String reducedUri = uri.replaceAll("http://dbpedia.org/resource/", "");
    String splitter[] = reducedUri.split("_");
    if (splitter.length == 2) {
        names.add((splitter[0] + " " + splitter[1]).toLowerCase());
        names.add((splitter[1] + " " + splitter[0]).toLowerCase());
    }
    return names;
}
// public Set<String> createDBpediaOccs(List<String> labels) {
// Set<String> set = new HashSet<String>();
// for (String s : labels) {
// set.add(s.toLowerCase());
// set.add(s.toLowerCase().replaceAll("[^A-Za-z0-9 ]", ""));
// String[] splitter = s.split(" ");
// if (splitter.length == 2) {
// for (int i = 0; i < splitter.length; i++) {
// splitter[i] = splitter[i].replaceAll("[^A-Za-z0-9 ]", "");
// }
// set.add(splitter[0].toLowerCase());
// set.add(splitter[1].toLowerCase());
// } else if (splitter.length > 2) {
// boolean hasKomma = false;
// int j = -1;
// for (int i = 0; i < splitter.length; i++) {
// if (splitter[i].endsWith(",")) {
// hasKomma = true;
// j = i;
// break;
// }
// }
// if (hasKomma) {
// StringBuilder builder = new StringBuilder();
// StringBuilder withbuilder = new StringBuilder();
// for (int i = 0; i <= j; ++i) {
// builder.append(splitter[i].replaceAll("[^A-Za-z0-9 ]",
// "").toLowerCase());
// withbuilder.append(splitter[i].replaceAll(",", "").toLowerCase());
// if (i < j) {
// builder.append(" ");
// withbuilder.append(" ");
// }
// }
// set.add(builder.toString());
// set.add(withbuilder.toString());
// builder = new StringBuilder();
// withbuilder = new StringBuilder();
// for(int i = j + 1; i < splitter.length; ++i) {
// builder.append(splitter[i].replaceAll("[^A-Za-z0-9 ]",
// "").toLowerCase());
// withbuilder.append(splitter[i].replaceAll(",", "").toLowerCase());
// System.out.println(i+" "+splitter.length);
// if(i < splitter.length - 1) {
// System.out.println("JHUUUU"+builder.toString());
// builder.append(" ");
// withbuilder.append(" ");
// }
// }
// set.add(builder.toString());
// set.add(withbuilder.toString());
// }
// }
//
// // Das erste Wort
//// set.add(splitter[0].toLowerCase());
//// if (splitter.length > 1) {
//// // Das letzte Wort
//// set.add(splitter[splitter.length - 1].toLowerCase());
//// }
// // Abkürzungen
// // StringBuilder builderWith = new StringBuilder();
// // StringBuilder builderWithout = new StringBuilder();
// // for(int i = 0; i < splitter.length; ++i) {
// // builderWith.append(splitter[i].substring(0, 1)+".");
// // builderWithout.append(splitter[i].substring(0, 1));
// // }
// // set.add(builderWith.toString().toLowerCase());
// // if(builderWithout.length() > 1) {
// // set.add(builderWithout.toString().toLowerCase());
// // }
// // N-Gramme
// // NgramIterator ngram = new NgramIterator(2, s);
// // while(ngram.hasNext()) {
// // set.add(ngram.next().toLowerCase());
// // }
// // NgramIterator ngram3 = new NgramIterator(3, s);
// // while(ngram3.hasNext()) {
// // set.add(ngram3.next().toLowerCase());
// // }
// }
// return set;
// }
//
// public List<String> getDbPediaLabel(final String uri)
// throws QueryException, QueryParseException {
/**
 * Returns all {@code rdfs:label} literals stored for the URI in the label model.
 *
 * A QueryParseException is logged and yields the labels collected so far
 * (normally an empty list). The query execution is now closed in all cases,
 * also when executing or iterating the query throws.
 *
 * @param uri entity URI
 * @return lexical forms of the labels, in result order (never null)
 * @throws QueryException if query execution fails
 */
public List<String> getDbPediaLabel(final String uri) throws QueryException, QueryParseException {
    final List<String> labellist = new LinkedList<String>();
    QueryExecution qexec = null;
    try {
        final String query = "SELECT ?label WHERE{ <" + uri
                + "> <http://www.w3.org/2000/01/rdf-schema#label> ?label. }";
        final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query);
        qexec = QueryExecutionFactory.create(cquery, this.labelmodel);
        final ResultSet results = qexec.execSelect();
        if (results != null) {
            while (results.hasNext()) {
                final QuerySolution sol = results.nextSolution();
                labellist.add(sol.getLiteral("label").getLexicalForm());
            }
        }
    } catch (QueryParseException e) {
        Logger.getRootLogger().info("Query parse Exception");
    } finally {
        if (qexec != null) {
            qexec.close();
        }
    }
    return labellist;
}
/**
 * Fetches the {@code rdfs:comment} of a resource from the short-description model.
 *
 * @param uri URI of the resource; it is placed verbatim between angle
 *            brackets in the SPARQL query
 * @return the lexical form of the last comment in the result set, or the
 *         empty string when nothing matched or the query could not be parsed
 * @throws QueryException      if executing the query fails for a reason other
 *                             than a parse error
 * @throws QueryParseException kept in the signature for callers; parse errors
 *                             are logged here and not rethrown
 */
public String getDbPediaShortDescription(final String uri) throws QueryException, QueryParseException {
    String description = "";
    final String query = "SELECT ?comment WHERE{ <" + uri
            + "> <http://www.w3.org/2000/01/rdf-schema#comment> ?comment. }";
    QueryExecution qexec = null;
    try {
        final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query);
        qexec = QueryExecutionFactory.create(cquery, this.shortdescmodel);
        final ResultSet results = qexec.execSelect();
        if (results != null) {
            while (results.hasNext()) {
                final QuerySolution sol = results.nextSolution();
                // Later solutions overwrite earlier ones: the last comment wins.
                description = sol.getLiteral("comment").getLexicalForm();
            }
        }
    } catch (QueryParseException e) {
        // Keep the cause in the log so a malformed URI can be tracked down.
        Logger.getRootLogger().info("Query parse Exception", e);
    } finally {
        // Release the execution on every path, including failures while iterating.
        if (qexec != null) {
            qexec.close();
        }
    }
    return description;
}
/**
 * Fetches the {@code dbpedia-owl:abstract} of a resource from the
 * long-description model.
 *
 * @param uri URI of the resource; it is placed verbatim between angle
 *            brackets in the SPARQL query
 * @return the lexical form of the last abstract in the result set, or the
 *         empty string when nothing matched or the query could not be parsed
 * @throws QueryException      if executing the query fails for a reason other
 *                             than a parse error
 * @throws QueryParseException kept in the signature for callers; parse errors
 *                             are logged here and not rethrown
 */
public String getDbPediaLongDescription(final String uri) throws QueryException, QueryParseException {
    String description = "";
    final String query = "SELECT ?comment WHERE{ <" + uri
            + "> <http://dbpedia.org/ontology/abstract> ?comment. }";
    QueryExecution qexec = null;
    try {
        final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query);
        qexec = QueryExecutionFactory.create(cquery, this.longdescmodel);
        final ResultSet results = qexec.execSelect();
        if (results != null) {
            while (results.hasNext()) {
                final QuerySolution sol = results.nextSolution();
                // Later solutions overwrite earlier ones: the last abstract wins.
                description = sol.getLiteral("comment").getLexicalForm();
            }
        }
    } catch (QueryParseException e) {
        // Keep the cause in the log so a malformed URI can be tracked down.
        Logger.getRootLogger().info("Query parse Exception", e);
    } finally {
        // Release the execution on every path, including failures while iterating.
        if (qexec != null) {
            qexec.close();
        }
    }
    return description;
}
/**
 * Reads the entity list file line by line and registers every entity.
 * <p>
 * Each line is URL-decoded as UTF-8, stripped of the
 * {@code http://dbpedia.org/resource/} prefix, converted with
 * {@code WikiPediaUriConverter.createConformDBpediaURI} and added to
 * {@code entities}. I/O problems are printed to stderr and end the read; the
 * reader is closed on every path.
 */
public void readEntities() {
    final File f = new File(ENTITYLIST);
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(f));
        String line;
        while ((line = reader.readLine()) != null) {
            // Literal replacement: the prefix contains '.' which a regex would treat as a wildcard.
            final String uri = URLDecoder.decode(line, "UTF-8").replace("http://dbpedia.org/resource/", "");
            entities.add(WikiPediaUriConverter.createConformDBpediaURI(uri));
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Adds mentions found in the web-occurrence files to the surface-form sets.
 * <p>
 * Every file in {@code WEBOCCURRENCESDIRECTORY} is scanned for lines starting
 * with {@code MENTION}. Such a line is split on tabs; field 1 is the mention
 * text and field 3 the Wikipedia URL, which is converted to a DBpedia URI.
 * The lower-cased mention is added to the set stored for that URI in
 * {@code UNIQUELABELSTRINGS}, creating the set when the URI is new.
 * <p>
 * Lines with fewer than four fields are skipped, an unlistable directory is
 * logged and ignored, and each reader is closed on every path.
 */
public void insertWebOccurrences() {
    final File dir = new File(WEBOCCURRENCESDIRECTORY);
    final File[] files = dir.listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        Logger.getRootLogger().warn("Cannot list web occurrence directory: " + dir);
        return;
    }
    for (int i = 0; i < files.length; i++) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(files[i]));
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.startsWith("MENTION")) {
                    continue;
                }
                final String[] splitter = line.split("\\t");
                if (splitter.length < 4) {
                    // Malformed record: mention or URL column is missing.
                    continue;
                }
                final String mention = splitter[1];
                final String uri = WikiPediaUriConverter
                        .createConformDBpediaURI(splitter[3].replaceAll("http://en.wikipedia.org/wiki/", ""));
                HashSet<String> set = UNIQUELABELSTRINGS.get(uri);
                if (set == null) {
                    set = new HashSet<String>();
                    UNIQUELABELSTRINGS.put(uri, set);
                }
                set.add(mention.toLowerCase());
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
/**
 * Records that surface form {@code sf} was seen {@code amount} more times for
 * {@code uri} and registers the surface form as a label candidate.
 * <p>
 * The counter map for {@code uri} is taken from {@code OCCURRENCES} and must
 * already exist. The lower-cased surface form, plus its variant without
 * special characters, goes into the set kept for {@code uri} in
 * {@code UNIQUELABELSTRINGS}; a new set is stored when the URI has none yet.
 *
 * @param uri    entity URI the surface form refers to
 * @param sf     surface form as observed
 * @param amount number of occurrences to add to the running count
 */
private void addOccurrence(String uri, String sf, int amount) {
    final HashMap<String, Integer> counts = OCCURRENCES.get(uri);
    final int total = counts.containsKey(sf) ? counts.get(sf) + amount : amount;
    counts.put(sf, total);

    final boolean known = UNIQUELABELSTRINGS.containsKey(uri);
    final HashSet<String> forms = known ? UNIQUELABELSTRINGS.get(uri) : new HashSet<String>();
    forms.add(sf.toLowerCase());
    addUniqueCandidateWithoutSpecialChars(forms, sf);
    if (!known) {
        UNIQUELABELSTRINGS.put(uri, forms);
    }
}
/**
 * Adds a simplified variant of a surface form to {@code set}.
 * <p>
 * The surface form is lower-cased and every character other than an ASCII
 * letter or a space is removed. The result is only added when it is longer
 * than two characters.
 *
 * @param set target set, modified in place
 * @param sf  surface form to simplify
 */
private void addUniqueCandidateWithoutSpecialChars(HashSet<String> set, String sf) {
    final String stripped = sf.toLowerCase().replaceAll("[^a-zA-Z ]", "");
    if (stripped.length() <= 2) {
        return;
    }
    set.add(stripped);
}
/**
 * Derives extra name variants for a resource that carries a FOAF surname.
 * <p>
 * First, all {@code foaf:surname} values are read from the person-data model
 * and added in lower case. Only when at least one surname exists is the
 * {@code rdfs:label} looked up in the label model (the last label wins). If
 * that label has more than two space-separated tokens, every ordered pair of
 * tokens that differ ignoring case is added as {@code "a b"} in lower case.
 * <p>
 * Query parse errors are logged and treated as "no result". Both query
 * executions are closed on every path.
 *
 * @param res URI of the resource; it is placed verbatim between angle
 *            brackets in the SPARQL queries
 * @return the collected lower-cased name variants; empty when the resource
 *         has no surname
 */
public HashSet<String> addAdditionalPersonNameOccurrences(String res) {
    final HashSet<String> names = new HashSet<String>();
    QueryExecution surnameExec = null;
    try {
        final String query = "SELECT ?surname WHERE{ <" + res + "> <http://xmlns.com/foaf/0.1/surname> ?surname. }";
        final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query);
        surnameExec = QueryExecutionFactory.create(cquery, this.persondata);
        final ResultSet results = surnameExec.execSelect();
        if (results != null) {
            while (results.hasNext()) {
                final QuerySolution sol = results.nextSolution();
                names.add(sol.getLiteral("surname").getLexicalForm().toLowerCase());
            }
        }
    } catch (QueryParseException e) {
        Logger.getRootLogger().info("Query parse Exception", e);
    } finally {
        if (surnameExec != null) {
            surnameExec.close();
        }
    }

    // A surname is used as the indicator that the resource is a person.
    if (names.isEmpty()) {
        return names;
    }

    String rdfslabel = "";
    QueryExecution labelExec = null;
    try {
        final String query = "SELECT ?label WHERE{ <" + res
                + "> <http://www.w3.org/2000/01/rdf-schema#label> ?label. }";
        final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query);
        labelExec = QueryExecutionFactory.create(cquery, this.labelmodel);
        final ResultSet results = labelExec.execSelect();
        if (results != null) {
            while (results.hasNext()) {
                final QuerySolution sol = results.nextSolution();
                rdfslabel = sol.getLiteral("label").getLexicalForm();
            }
        }
    } catch (QueryParseException e) {
        Logger.getRootLogger().info("Query parse Exception", e);
    } finally {
        if (labelExec != null) {
            labelExec.close();
        }
    }

    final String[] splitter = rdfslabel.split(" ");
    if (splitter.length > 2) {
        // Generate the different name combinations from the label tokens.
        for (int i = 0; i < splitter.length; i++) {
            for (int j = 0; j < splitter.length; j++) {
                if (!splitter[i].equalsIgnoreCase(splitter[j])) {
                    names.add((splitter[i] + " " + splitter[j]).toLowerCase());
                }
            }
        }
    }
    return names;
}
/**
 * Turns disambiguation-page links into surface forms.
 * <p>
 * The RDF file {@code DISAMBIGUATIONWIKILINKS} is loaded into a fresh model.
 * For every statement whose object is a resource under
 * {@code http://dbpedia.org/resource/}, the subject's local name, with a
 * trailing {@code _(disambiguation)} removed and lower-cased, is added to the
 * object's set in {@code UNIQUELABELSTRINGS}, together with its variant
 * without special characters. The set is created when the URI is new.
 * <p>
 * The predicate is not inspected. Statements whose subject or object is a
 * blank node (no URI) are skipped.
 */
public void readWikiPageDisambiguation() {
    final Model m = ModelFactory.createDefaultModel();
    m.read(DISAMBIGUATIONWIKILINKS);
    final StmtIterator it = m.listStatements();
    while (it.hasNext()) {
        final Statement s = it.next();
        final RDFNode object = s.getObject();
        if (!object.isResource()) {
            continue;
        }
        final String targetUri = object.asResource().getURI();
        final String sourceUri = s.getSubject().getURI();
        // getURI() yields null for blank nodes; those cannot contribute a label.
        if (targetUri == null || sourceUri == null) {
            continue;
        }
        if (!targetUri.startsWith("http://dbpedia.org/resource/")) {
            continue;
        }
        String label = sourceUri.replaceAll("http://dbpedia.org/resource/", "");
        label = label.replaceAll("\\_\\(disambiguation\\)", "").toLowerCase();
        HashSet<String> set = UNIQUELABELSTRINGS.get(targetUri);
        if (set == null) {
            set = new HashSet<String>();
            UNIQUELABELSTRINGS.put(targetUri, set);
        }
        set.add(label);
        addUniqueCandidateWithoutSpecialChars(set, label);
    }
}
/**
 * Collects the URIs of all sports teams.
 * <p>
 * The RDF file {@code INSTANCEMAPPINGTYPES_NT} is loaded into a fresh model.
 * The subject URI of every statement whose object is the resource
 * {@code http://dbpedia.org/ontology/SportsTeam} (compared ignoring case) is
 * added to {@code teams}. Blank-node subjects and objects are skipped.
 */
public void sportsTeamsSurfaceForms() {
    final Model m = ModelFactory.createDefaultModel();
    m.read(INSTANCEMAPPINGTYPES_NT);
    final StmtIterator it = m.listStatements();
    while (it.hasNext()) {
        final Statement s = it.next();
        final RDFNode object = s.getObject();
        if (!object.isResource()) {
            continue;
        }
        // Constant on the left keeps the comparison safe when getURI() is null (blank node).
        if (!"http://dbpedia.org/ontology/SportsTeam".equalsIgnoreCase(object.asResource().getURI())) {
            continue;
        }
        final String teamUri = s.getSubject().getURI();
        if (teamUri != null) {
            teams.add(teamUri);
        }
    }
}
/**
 * Builds short names for a sports team from its surface forms and its URI.
 * <p>
 * From the surface forms, every space-separated token that consists only of
 * ASCII letters and is longer than three characters is kept in lower case.
 * From the URI's local name split on underscores: when there are at least two
 * segments and the first segment alone is a known entity, the lower-cased
 * first segment is added; when there are more than two segments and the first
 * two joined by an underscore are a known entity, they are added joined by a
 * space. Each URI-derived name is also printed to stdout.
 *
 * @param set existing surface forms of the team
 * @param uri DBpedia URI of the team
 * @return a new set with the derived names
 */
private HashSet<String> extractSportsTeamNames(HashSet<String> set, String uri) {
    final HashSet<String> derived = new HashSet<String>();
    for (final String surfaceForm : set) {
        for (final String token : surfaceForm.split(" ")) {
            final boolean lettersOnly = token.equalsIgnoreCase(token.replaceAll("[^a-zA-Z ]", ""));
            final String lowered = token.toLowerCase();
            if (lettersOnly && lowered.length() > 3) {
                derived.add(lowered);
            }
        }
    }

    final String localName = uri.replaceAll("http://dbpedia.org/resource/", "");
    final String[] segments = localName.split("_");

    // First segment on its own: checked for two-segment and longer names alike.
    if (segments.length >= 2 && entities.contains("http://dbpedia.org/resource/" + segments[0])) {
        System.out.println("SPORTSTEAM: " + segments[0].toLowerCase() + " " + localName);
        derived.add(segments[0].toLowerCase());
    }

    // First two segments together: only for names with more than two segments.
    if (segments.length > 2
            && entities.contains("http://dbpedia.org/resource/" + segments[0] + "_" + segments[1])) {
        final String firstTwo = (segments[0] + " " + segments[1]).toLowerCase();
        derived.add(firstTwo);
        System.out.println("SPORTSTEAM: " + firstTwo + " " + localName);
    }
    return derived;
}
/**
 * Maps a set of RDF type URIs to one of the coarse domains
 * {@code "Person"}, {@code "Organisation"}, {@code "Location"} or
 * {@code "Misc"}.
 * <p>
 * The set is scanned in iteration order and the first URI that matches one of
 * the three known types (ignoring case) decides the result.
 *
 * @param set RDF type URIs of an entity
 * @return the domain name, {@code "Misc"} when no URI matches
 */
private String filterStandardDomain(Set<String> set) {
    for (final String typeUri : set) {
        if (typeUri.equalsIgnoreCase("http://dbpedia.org/ontology/Person")) {
            return "Person";
        }
        if (typeUri.equalsIgnoreCase("http://dbpedia.org/ontology/Organisation")) {
            return "Organisation";
        }
        if (typeUri.equalsIgnoreCase("http://www.ontologydesignpatterns.org/ont/d0.owl#Location")) {
            return "Location";
        }
    }
    return "Misc";
}
/**
 * Returns the {@code rdf:type} values of an entity from the instance-types model.
 * <p>
 * A {@link QueryException} is logged together with its stack trace and not
 * rethrown; whatever was collected up to that point is returned. The query
 * execution is closed on every path.
 *
 * @param entityUri URI of the entity; it is placed verbatim between angle
 *                  brackets in the SPARQL query
 * @return the string form of every type resource found; empty when there is
 *         none or the query failed before yielding results
 */
public Set<String> getRDFTypesFromEntity(final String entityUri) {
    final Set<String> set = new HashSet<String>();
    final String query = "SELECT ?types WHERE{ <" + entityUri
            + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?types. }";
    QueryExecution qexec = null;
    try {
        final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query);
        qexec = QueryExecutionFactory.create(cquery, instancemappingtypes);
        final ResultSet results = qexec.execSelect();
        while (results != null && results.hasNext()) {
            final QuerySolution sol = results.nextSolution();
            set.add(sol.getResource("types").toString());
        }
    } catch (final QueryException e) {
        // Pass the throwable itself: logging e.getStackTrace() only prints the array's identity.
        Logger.getRootLogger().error("Could not query RDF types of <" + entityUri + ">", e);
    } finally {
        if (qexec != null) {
            qexec.close();
        }
    }
    return set;
}
/**
 * Adds a dotted-initials abbreviation for every multi-word location.
 * <p>
 * For each URI in {@code UNIQUELABELSTRINGS} whose RDF types map to the
 * {@code "Location"} domain, the local name is lower-cased, underscores are
 * turned into spaces, and the first character of each word followed by a dot
 * is concatenated (for example {@code new_york_city} becomes {@code n.y.c.}).
 * The abbreviation is only added when the name has more than one word.
 * <p>
 * Empty tokens, which arise from leading or consecutive underscores, are
 * ignored instead of failing on {@code substring(0, 1)}.
 */
public void addSomeAbbreviations() {
    for (Map.Entry<String, HashSet<String>> entry : this.UNIQUELABELSTRINGS.entrySet()) {
        final String url = entry.getKey();
        final String type = filterStandardDomain(getRDFTypesFromEntity(url));
        if (!type.equals("Location")) {
            continue;
        }
        final String name = url.replaceAll("http://dbpedia.org/resource/", "").toLowerCase()
                .replaceAll("_", " ");
        final StringBuilder builder = new StringBuilder();
        int words = 0;
        for (final String word : name.split(" ")) {
            if (word.isEmpty()) {
                continue;
            }
            builder.append(word.charAt(0)).append('.');
            words++;
        }
        if (words > 1) {
            entry.getValue().add(builder.toString());
        }
    }
}
/**
 * Enriches {@code UNIQUELABELSTRINGS} with surface forms from annotated corpora
 * and a handful of hand-written entries.
 * <ol>
 * <li>Registers every entity in {@code urlentitymapping} under its lower-cased
 * URI, unless that key is already present.</li>
 * <li>Parses each file in {@code SURFACEFORMDIRECTORY} as XML with a
 * {@link Handler}.</li>
 * <li>Scans each file in {@code SURFACEFORMDIRECTORYN3} for
 * {@code nif:anchorOf} (surface form, lower-cased) and
 * {@code itsrdf:taIdentRef} (entity URI) lines. Once both have been seen the
 * pair is printed, the surface form is added if the entity is already known,
 * and both values are reset.</li>
 * <li>Adds the fixed custom surface forms listed at the end.</li>
 * </ol>
 * A directory that cannot be listed is logged and skipped. Parse and I/O
 * errors are printed to stderr and do not stop the remaining files. Every
 * reader is closed on every path.
 */
public void addAdditionalSurfaceForms() {
    // Hack: lets Handler map a lower-cased URI back to its original spelling.
    for (String s : entities) {
        if (!urlentitymapping.containsKey(s.toLowerCase())) {
            urlentitymapping.put(s.toLowerCase(), s);
        }
    }

    final File folder = new File(SURFACEFORMDIRECTORY);
    final File[] files = folder.listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        Logger.getRootLogger().warn("Cannot list surface form directory: " + folder);
    } else {
        for (int i = 0; i < files.length; i++) {
            FileReader reader = null;
            try {
                final XMLReader xmlReader = XMLReaderFactory.createXMLReader();
                reader = new FileReader(files[i]);
                xmlReader.setContentHandler(new Handler());
                xmlReader.parse(new InputSource(reader));
            } catch (SAXException e) {
                e.printStackTrace();
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }

    final File dirn3 = new File(SURFACEFORMDIRECTORYN3);
    final File[] n3files = dirn3.listFiles();
    if (n3files == null) {
        Logger.getRootLogger().warn("Cannot list N3 surface form directory: " + dirn3);
    } else {
        for (int i = 0; i < n3files.length; i++) {
            BufferedReader reader = null;
            try {
                reader = new BufferedReader(new FileReader(n3files[i]));
                String line;
                String sf = null;
                String entity = null;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    if (line.startsWith("nif:anchorOf")) {
                        // The surface form is the first double-quoted string on the line.
                        final String[] splitter = line.split("\"");
                        if (splitter.length > 1) {
                            sf = splitter[1].toLowerCase();
                        }
                    }
                    if (line.startsWith("itsrdf:taIdentRef")) {
                        // The entity is the text between the first '<' and the following '>'.
                        final String[] splitter = line.split("<");
                        if (splitter.length > 1) {
                            final String rest = splitter[1];
                            final int end = rest.indexOf('>');
                            // indexOf avoids the empty array that split(">") yields for "<>".
                            entity = end >= 0 ? rest.substring(0, end) : rest;
                        }
                    }
                    if (sf != null && entity != null) {
                        System.out.println("SF: " + sf + " Entity: " + entity);
                        if (UNIQUELABELSTRINGS.containsKey(entity)) {
                            final Set<String> strings = UNIQUELABELSTRINGS.get(entity);
                            strings.add(sf);
                        }
                        sf = null;
                        entity = null;
                    }
                }
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }

    // addCustomSurfaceForm("http://dbpedia.org/resource/Annual_Meetings_of_the_International_Monetary_Fund_and_the_World_Bank_Group",
    // "annual meetings of the international monetary fund");
    addCustomSurfaceForm("http://dbpedia.org/resource/Port_of_Turku", "turku's harbor");
    addCustomSurfaceForm("http://dbpedia.org/resource/Rear-end_collision", "rear-ended");
    addCustomSurfaceForm("http://dbpedia.org/resource/Song", "ngs b");
    addCustomSurfaceForm("http://dbpedia.org/resource/Finnish_sauna", "finnish bathhouses");
    addCustomSurfaceForm("http://dbpedia.org/resource/Autonomous_car", "vehicles that can drive themselves");
    addCustomSurfaceForm("http://dbpedia.org/resource/Free_association_(psychology)", "free-associative");
    addCustomSurfaceForm("http://dbpedia.org/resource/Leaf_shape", "leaf-shaped");
    addCustomSurfaceForm("http://dbpedia.org/resource/CSKA_Moscow_Stadium", "arena of CSKA Moscow");
    addCustomSurfaceForm("http://dbpedia.org/resource/Capital_of_Germany", "german capital's");
    addCustomSurfaceForm("http://dbpedia.org/resource/MSN", "msn network");
    addCustomSurfaceForm("http://dbpedia.org/resource/Sprint_Corporation", "sprint communications co");
    addCustomSurfaceForm("http://dbpedia.org/resource/Abdelbaset_al-Megrahi", "abdulbasit al-maqrahi");
}
/**
 * Attaches a hand-written surface form to an entity that is already known.
 * Nothing happens when {@code url} has no entry in {@code UNIQUELABELSTRINGS}.
 *
 * @param url DBpedia URI of the entity
 * @param sf  surface form to add, stored exactly as given
 */
private void addCustomSurfaceForm(String url, String sf) {
    if (!UNIQUELABELSTRINGS.containsKey(url)) {
        return;
    }
    final Set<String> forms = UNIQUELABELSTRINGS.get(url);
    forms.add(sf);
}
/**
 * SAX handler that extracts (surface form, entity) pairs from annotation XML.
 * <p>
 * The text of a {@code SurfaceForm} element is remembered until the end of the
 * next {@code ChosenAnnotation} element, whose text is the Wikipedia URL of
 * the entity. That URL is stripped of its {@code http://en.wikipedia.org/wiki/}
 * prefix, converted to a DBpedia URI, lower-cased and looked up in
 * {@code urlentitymapping}. When it maps to an entity with an existing set in
 * {@code UNIQUELABELSTRINGS}, the lower-cased surface form (underscores
 * replaced by spaces) is added to that set.
 * <p>
 * Element names are matched against both the local name and the qualified
 * name, so the handler works whether or not the parser reports namespaces.
 */
class Handler implements ContentHandler {
    /** Character data received since the last tracked element was opened. */
    private final StringBuilder currentValue = new StringBuilder();
    /** Text of the most recent {@code SurfaceForm} element. */
    private String surfaceForm = "";
    /** Text of the most recent {@code ChosenAnnotation} element. */
    private String entityUrl = "";

    Handler() {
        super();
    }

    /** Returns true when either reported element name equals {@code expected}. */
    private boolean isElement(String expected, String localName, String qName) {
        return expected.equals(localName) || expected.equals(qName);
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        // A parser may deliver one text node in several chunks, so accumulate.
        currentValue.append(ch, start, length);
    }

    @Override
    public void endDocument() throws SAXException {
        // Nothing to do.
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (isElement("SurfaceForm", localName, qName)) {
            surfaceForm = currentValue.toString();
        }
        if (isElement("ChosenAnnotation", localName, qName)) {
            // Trim before the emptiness check so whitespace-only text is not stored as "".
            entityUrl = currentValue.toString().trim();
            surfaceForm = surfaceForm.trim();
            if (surfaceForm.isEmpty() || entityUrl.isEmpty()) {
                return;
            }
            entityUrl = entityUrl.replaceAll("http://en.wikipedia.org/wiki/", "");
            entityUrl = WikiPediaUriConverter.createConformDBpediaURI(entityUrl);
            entityUrl = entityUrl.toLowerCase();
            if (urlentitymapping.containsKey(entityUrl)) {
                final HashSet<String> set = UNIQUELABELSTRINGS.get(urlentitymapping.get(entityUrl));
                if (set != null) {
                    set.add(surfaceForm.toLowerCase().replaceAll("_", " "));
                }
            }
        }
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
        // Nothing to do.
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        // Nothing to do.
    }

    @Override
    public void processingInstruction(String target, String data) throws SAXException {
        // Nothing to do.
    }

    @Override
    public void setDocumentLocator(Locator locator) {
        // Nothing to do.
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
        // Nothing to do.
    }

    @Override
    public void startDocument() throws SAXException {
        // Nothing to do.
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
        if (isElement("SurfaceForm", localName, qName)) {
            // A new annotation begins: forget the previous pair.
            surfaceForm = "";
            entityUrl = "";
            currentValue.setLength(0);
        }
        if (isElement("ChosenAnnotation", localName, qName)) {
            currentValue.setLength(0);
        }
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
        // Nothing to do.
    }
}
/**
 * Runs the complete index build, printing a banner before each step.
 * <p>
 * The steps run strictly in the order shown. The banner numbering jumps from
 * 13 to 15, and the evidence-loading step only prints its banner because the
 * call itself is disabled.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    final CreateDBpediaIndexV2 creator = new CreateDBpediaIndexV2();

    // Evidence loading is switched off; the banner is still printed.
    System.out.println("Step-1: Load Evidences");

    System.out.println("Step0: Create DBpediaPriors");
    creator.createDBpediaPriors();

    System.out.println("Step1: Read Sportsteams");
    creator.sportsTeamsSurfaceForms();

    System.out.println("Step2: Read Wikipedia Disambiguation Links");
    creator.readWikiPageDisambiguation();

    System.out.println("Step3: Read Entity List");
    creator.readEntities();

    System.out.println("Step4: DBPediaFacts");
    creator.fillRelationsIndex();

    System.out.println("Step5: DBPediaProperties");
    creator.fillPropertiesIndex();

    System.out.println("Step6: PattyFacts");
    creator.fillPattyRelationIndex(PATTYWIKIPATTERN, PATTYWIKIINSTANCE);

    System.out.println("Step7: PattyFreebaseFacts");
    creator.fillPattyFreebaseRelationIndex(PATTYFREEBASEPATTERN, PATTYFREEBASEINSTANCE);

    System.out.println("Step8: WorkLinkText");
    creator.workLinkText();

    System.out.println("Step9: ReadOldIndex");
    creator.getUniqueLabelsFromOldIndex();

    System.out.println("Step10: WorkEntities");
    creator.workEntities();

    System.out.println("Step11: WorkRedirects");
    creator.workRedirects();

    System.out.println("Step12: WebOccurrences");
    creator.insertWebOccurrences();

    System.out.println("Step13: CreateSomeAbbreviations");
    creator.addSomeAbbreviations();

    System.out.println("Step15: AddSomeSurfaceForms");
    creator.addAdditionalSurfaceForms();

    System.out.println("Step16: CreateIndex");
    creator.createNewIndex();
}
}