package doser.tools.indexcreation;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdtjena.HDTGraph;
import com.hp.hpl.jena.query.QueryException;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.QueryParseException;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import doser.lucene.analysis.DoserIDAnalyzer;
import doser.lucene.analysis.DoserStandardAnalyzer;
public class CreateDBPediaIndex {
public static final String LABELHDT = "/home/zwicklbauer/WikipediaIndexGeneration/rdffiles/labels_en.hdt";
public static final String SHORTDESCHDT = "/home/zwicklbauer/WikipediaIndexGeneration/rdffiles/short_abstracts_en.hdt";
public static final String LONGDESCHDT = "/home/zwicklbauer/WikipediaIndexGeneration/rdffiles/long_abstracts_en.hdt";
public static final String MAPPINGPROPERTIES = "/home/zwicklbauer/HDTGeneration/mappingbased_properties_cleaned_en.nt";
public static final String PATTYWIKIPATTERN = "/home/zwicklbauer/Patty/patty-dataset-WikiTypes/wikipedia-patterns.txt";
public static final String PATTYWIKIINSTANCE = "/home/zwicklbauer/Patty/patty-dataset-WikiTypes/wikipedia-instances.txt";
public static final String PATTYFREEBASEPATTERN = "/home/zwicklbauer/Patty/patty-dataset-freebase/wikipedia-patterns.txt";
public static final String PATTYFREEBASEINSTANCE = "/home/zwicklbauer/Patty/patty-dataset-freebase/wikipedia-instances.txt";
public static final String LINKTEXT = "/home/zwicklbauer/WikipediaEntities/enwiki-latest/linktext";
public static final String ENTITIES = "/home/zwicklbauer/WikipediaEntities/entities_StandardParse_threshold12";
public static final String REDIRECTS = "/home/zwicklbauer/WikipediaEntities/enwiki-latest/redirects";
public static final String EVIDENCEDIRECTORY = "/mnt/ssd1/evidence/*/";
public static final String WEBOCCURRENCESDIRECTORY = "/home/zwicklbauer/WikipediaEntities/EntitiesWebContext/";
public static final String OLDINDEX = "/mnt/ssd1/disambiguation/MMapLuceneIndexStandard/";
public static final String NEWINDEX = "/home/zwicklbauer/NewIndexTryout";
private HashMap<String, HashSet<String>> LABELS;
private HashMap<String, HashSet<String>> UNIQUELABELSTRINGS;
private HashMap<String, HashMap<String, Integer>> OCCURRENCES;
// private HashMap<String, String> OCCURRENCES;
private HashMap<String, LinkedList<String>> relationmap;
private HashMap<String, LinkedList<String>> pattymap;
private HashMap<String, LinkedList<String>> pattyfreebasemap;
private HashMap<String, HashSet<String>> evidencemap;
private static int evidencecounter = 0;
private Model labelmodel;
private Model shortdescmodel;
private Model longdescmodel;
private int counter;
CreateDBPediaIndex() {
super();
this.LABELS = new HashMap<String, HashSet<String>>();
this.UNIQUELABELSTRINGS = new HashMap<String, HashSet<String>>();
this.OCCURRENCES = new HashMap<String, HashMap<String, Integer>>();
this.relationmap = new HashMap<String, LinkedList<String>>();
this.pattymap = new HashMap<String, LinkedList<String>>();
this.pattyfreebasemap = new HashMap<String, LinkedList<String>>();
this.evidencemap = new HashMap<String, HashSet<String>>();
HDT labelhdt;
HDT shortdeschdt;
HDT longdeschdt;
try {
labelhdt = HDTManager.mapIndexedHDT(LABELHDT, null);
shortdeschdt = HDTManager.mapIndexedHDT(SHORTDESCHDT, null);
longdeschdt = HDTManager.mapIndexedHDT(LONGDESCHDT, null);
final HDTGraph labelhdtgraph = new HDTGraph(labelhdt);
final HDTGraph shortdeschdtgraph = new HDTGraph(shortdeschdt);
final HDTGraph longdeschdtgraph = new HDTGraph(longdeschdt);
this.labelmodel = ModelFactory.createModelForGraph(labelhdtgraph);
this.shortdescmodel = ModelFactory
.createModelForGraph(shortdeschdtgraph);
this.longdescmodel = ModelFactory
.createModelForGraph(longdeschdtgraph);
} catch (IOException e) {
e.printStackTrace();
}
this.counter = 0;
}
/**
* Fill Occurrences and UniqueLabelStrings
*
* Speichere alle Entity Occurrences und UniqueLabelStrings
*/
public void workLinkText() {
File f = new File(LINKTEXT);
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(f));
String line = null;
while ((line = reader.readLine()) != null) {
String split[] = line.split("\\t");
if (split.length > 2) {
for (int i = 2; i < split.length; ++i) {
String ent = split[i];
String[] occ = ent.split(":");
// Bugfix: Fix Wrong splitting
StringBuilder builder = new StringBuilder();
for (int j = 0; j < occ.length - 1; j++) {
builder.append(occ[j] + ":");
}
String nr = occ[occ.length - 1];
String entity = builder.toString();
entity = entity.substring(0, entity.length() - 1);
String uri = WikiPediaUriConverter
.createConformDBpediaURI(entity);
if (!uri.contains("(Disambiguation)")) {
// System.out.println(uri);
// UniqueLabelStrings
if (UNIQUELABELSTRINGS.containsKey(uri)) {
HashSet<String> set = UNIQUELABELSTRINGS
.get(uri);
set.add(split[0].toLowerCase());
} else {
HashSet<String> set = new HashSet<String>();
set.add(split[0].toLowerCase());
UNIQUELABELSTRINGS.put(uri, set);
}
// Occurrences
if(!OCCURRENCES.containsKey(uri)) {
HashMap<String, Integer> map = new HashMap<String, Integer>();
OCCURRENCES.put(uri, map);
}
addOccurrence(uri, split[0].toLowerCase(), Integer.valueOf(nr));
}
}
}
}
File oldIndexFile = new File(OLDINDEX);
IndexReader readerOldIndex = null;
try {
final Directory oldDir = FSDirectory.open(oldIndexFile);
readerOldIndex = DirectoryReader.open(oldDir);
for (int j = 0; j < readerOldIndex.maxDoc(); ++j) {
Document oldDoc = readerOldIndex.document(j);
String[] oldUniqueLabels = oldDoc
.getValues("UniqueLabelString");
String oldResource = oldDoc.get("Mainlink");
// Transform old to new Namespace
oldResource = oldResource.replaceAll(
"http://dbpedia.org/resource/", "");
oldResource = URLDecoder.decode(oldResource, "UTF-8");
oldResource = WikiPediaUriConverter
.createConformDBpediaURI(oldResource);
if (!UNIQUELABELSTRINGS.containsKey(oldResource)) {
UNIQUELABELSTRINGS.put(oldResource, new HashSet<String>());
}
HashSet<String> set = UNIQUELABELSTRINGS.get(oldResource);
if (oldUniqueLabels != null
&& oldUniqueLabels.length > 0) {
for (int k = 0; k < oldUniqueLabels.length; ++k) {
set.add(oldUniqueLabels[k].toLowerCase());
}
}
if (!OCCURRENCES.containsKey(oldResource)) {
OCCURRENCES.put(oldResource, new HashMap<String, Integer>());
}
String oldOccurrences = oldDoc.get("Occurrences");
if ((oldOccurrences != null)
&& !oldOccurrences.equalsIgnoreCase("")) {
final String[] splitter = oldOccurrences
.split(";;;");
for (final String element : splitter) {
final String[] splitter1 = element
.split(":::");
int check = 1;
try {
check = Integer.valueOf(splitter1[1]);
} catch (final NumberFormatException e) {
Logger.getRootLogger()
.error("Warning NumberFormatException while Initialization: ");
}
addOccurrence(oldResource, splitter1[0], check);
}
}
}
readerOldIndex.close();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (readerOldIndex != null) {
try {
readerOldIndex.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
// System.out.println(UNIQUELABELSTRINGS.size());
}
public void workEntities() {
File f = new File(ENTITIES);
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(f));
String line = null;
while ((line = reader.readLine()) != null) {
String split[] = line.split("\\t");
if (split.length > 2) {
for (int i = 2; i < split.length; ++i) {
String ent = split[i];
String[] occ = ent.split(":");
String uri = WikiPediaUriConverter
.createConformDBpediaURI(occ[0]);
// Synonyms
if (LABELS.containsKey(uri)) {
HashSet<String> set = LABELS.get(uri);
// Add Label to UniqueLabel
addLabelToUniqueLabel(uri, split[0]);
set.add(split[0].toLowerCase());
} else {
HashSet<String> set = new HashSet<String>();
// set.add(occ[0].toLowerCase());
set.add(split[0].toLowerCase());
// Add Label to UniqueLabel
addLabelToUniqueLabel(uri, split[0]);
LABELS.put(uri, set);
}
}
}
}
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
private void addLabelToUniqueLabel(String entity, String label) {
if (UNIQUELABELSTRINGS.containsKey(entity)) {
HashSet<String> set = UNIQUELABELSTRINGS.get(entity);
set.add(label.toLowerCase());
}
}
public void workRedirects() {
File f = new File(REDIRECTS);
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(f));
String line = null;
while ((line = reader.readLine()) != null) {
String split[] = line.split("\\t");
// Bug Fix of wrong redirects
if (split.length < 3) {
String uri = WikiPediaUriConverter
.createConformDBpediaURI(split[1]);
if (LABELS.containsKey(uri)) {
HashSet<String> set = LABELS.get(uri);
set.add(split[0].toLowerCase());
// Add Label to UniqueLabel
addLabelToUniqueLabel(uri, split[0]);
} else {
HashSet<String> set = new HashSet<String>();
set.add(split[0].toLowerCase());
// Add Label to UniqueLabel
addLabelToUniqueLabel(uri, split[0]);
LABELS.put(uri, set);
}
}
}
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
public void createNewIndex() {
File newIndexFile = new File(NEWINDEX);
try {
final Directory newDir = FSDirectory.open(newIndexFile);
Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
analyzerPerField.put("Label", new DoserStandardAnalyzer());
analyzerPerField.put("PattyRelations", new DoserIDAnalyzer());
analyzerPerField.put("PattyFreebaseRelations",
new DoserIDAnalyzer());
analyzerPerField.put("Relations", new DoserIDAnalyzer());
analyzerPerField.put("Occurrences", new DoserIDAnalyzer());
PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(
new StandardAnalyzer(), analyzerPerField);
final IndexWriterConfig config = new IndexWriterConfig(
Version.LATEST, aWrapper);
final IndexWriter newIndexWriter = new IndexWriter(newDir, config);
for (Map.Entry<String, HashSet<String>> entry : UNIQUELABELSTRINGS
.entrySet()) {
String uri = entry.getKey();
HashSet<String> set = entry.getValue();
Document doc = new Document();
// Add ID
doc.add(new StringField("ID", "DBpedia_"
+ String.valueOf(counter), Store.YES));
counter++;
// Add Mainlink
doc.add(new StringField("Mainlink", uri, Store.YES));
// Add labels
List<String> origLabels = getDbPediaLabel(uri);
HashSet<String> labelset = LABELS.get(uri);
if (labelset != null) {
for (String s : origLabels) {
labelset.add(s);
}
}
if (LABELS.containsKey(uri)) {
labelset = LABELS.get(uri);
for (String s : labelset) {
doc.add(new TextField("Label", s, Store.YES));
}
}
// Add ShortDescriptions
String shortDescription = getDbPediaShortDescription(uri);
doc.add(new TextField("ShortDescription", shortDescription,
Store.YES));
// Add longDescriptions
String longDescription = getDbPediaLongDescription(uri);
doc.add(new TextField("LongDescription", longDescription,
Store.YES));
// UniqueLabelStrings
for (String s : set) {
doc.add(new StringField("UniqueLabel", s, Store.YES));
}
// Add Occurrences
if (OCCURRENCES.containsKey(uri)) {
// First Build Occurrences String
HashMap<String, Integer> map = OCCURRENCES.get(uri);
StringBuilder builder = new StringBuilder();
for (Map.Entry<String, Integer> ent : map.entrySet()) {
String key = ent.getKey();
Integer value = ent.getValue();
builder.append(key + ":::" + String.valueOf(value)
+ ";;;");
}
String occs = builder.toString();
if (occs.length() > 0) {
occs = occs.substring(0, occs.length() - 3);
}
doc.add(new TextField("Occurrences", occs, Store.YES));
}
// Add DBPedia Facts
if (relationmap.containsKey(uri)) {
LinkedList<String> l = relationmap.get(uri);
StringBuilder builder = new StringBuilder();
if (l != null) {
for (String str : l) {
builder.append(str);
builder.append(";;;");
}
}
String s = builder.toString();
if (s.length() > 0) {
s = s.substring(0, s.length() - 3);
}
doc.add(new TextField("Relations", s, Store.YES));
} else {
doc.add(new TextField("Relations", "", Store.YES));
}
// Add PattyFacts
if (pattymap.containsKey(uri)) {
LinkedList<String> l = pattymap.get(uri);
StringBuilder builder = new StringBuilder();
if (l != null) {
for (String str : l) {
builder.append(str);
builder.append(";;;");
}
}
String s = builder.toString();
if (s.length() > 0) {
s = s.substring(0, s.length() - 3);
}
doc.add(new TextField("PattyRelations", s, Store.YES));
} else {
doc.add(new TextField("PattyRelations", "", Store.YES));
}
// Add PattyFreebaseFacts
if (pattyfreebasemap.containsKey(uri)) {
LinkedList<String> l = pattyfreebasemap.get(uri);
StringBuilder builder = new StringBuilder();
if (l != null) {
for (String str : l) {
builder.append(str);
builder.append(";;;");
}
}
String s = builder.toString();
if (s.length() > 0) {
s = s.substring(0, s.length() - 3);
}
doc.add(new TextField("PattyFreebaseRelations", s,
Store.YES));
} else {
doc.add(new TextField("PattyFreebaseRelations", "",
Store.YES));
}
// Add Entity Evidence
// if (evidencemap.containsKey(uri)) {
// HashSet<String> evidenceset = evidencemap.get(uri);
// for (String evidence : evidenceset) {
// doc.add(new StringField("Evidence", evidence,
// Field.Store.YES));
// evidencecounter++;
// }
// }
// Write Document To Index
if (doc.get("Label") != null
&& !doc.get("Label").equalsIgnoreCase("")) {
newIndexWriter.addDocument(doc);
}
}
newIndexWriter.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public List<String> getDbPediaLabel(final String uri)
throws QueryException, QueryParseException {
final List<String> labellist = new LinkedList<String>();
try {
final String query = "SELECT ?label WHERE{ <"
+ uri
+ "> <http://www.w3.org/2000/01/rdf-schema#label> ?label. }";
ResultSet results = null; // NOPMD by quh on 14.02.14 10:04
QueryExecution qexec = null;
final com.hp.hpl.jena.query.Query cquery = QueryFactory
.create(query);
qexec = QueryExecutionFactory.create(cquery, this.labelmodel);
results = qexec.execSelect();
if (results != null) {
while (results.hasNext()) {
final QuerySolution sol = results.nextSolution();
final String label = sol.getLiteral("label")
.getLexicalForm();
labellist.add(label);
}
qexec.close();
}
} catch (QueryParseException e) {
Logger.getRootLogger().info("Query parse Exception");
}
return labellist;
}
public String getDbPediaShortDescription(final String uri)
throws QueryException, QueryParseException {
String labellist = "";
try {
final String query = "SELECT ?comment WHERE{ <"
+ uri
+ "> <http://www.w3.org/2000/01/rdf-schema#comment> ?comment. }";
ResultSet results = null;
QueryExecution qexec = null;
final com.hp.hpl.jena.query.Query cquery = QueryFactory
.create(query);
qexec = QueryExecutionFactory.create(cquery, this.shortdescmodel);
results = qexec.execSelect();
if (results != null) {
while (results.hasNext()) {
final QuerySolution sol = results.nextSolution();
String desc = sol.getLiteral("comment").getLexicalForm();
labellist = desc;
}
qexec.close();
}
} catch (QueryParseException e) {
Logger.getRootLogger().info("Query parse Exception");
}
return labellist;
}
public String getDbPediaLongDescription(final String uri)
throws QueryException, QueryParseException {
String labellist = "";
try {
final String query = "SELECT ?comment WHERE{ <" + uri
+ "> <http://dbpedia.org/ontology/abstract> ?comment. }";
ResultSet results = null;
QueryExecution qexec = null;
final com.hp.hpl.jena.query.Query cquery = QueryFactory
.create(query);
qexec = QueryExecutionFactory.create(cquery, this.longdescmodel);
results = qexec.execSelect();
if (results != null) {
while (results.hasNext()) {
final QuerySolution sol = results.nextSolution();
final String desc = sol.getLiteral("comment")
.getLexicalForm();
labellist = desc;
}
qexec.close();
}
} catch (QueryParseException e) {
Logger.getRootLogger().info("Query parse Exception");
}
return labellist;
}
public void fillRelationsIndex() {
Model m = ModelFactory.createDefaultModel();
m.read(MAPPINGPROPERTIES);
StmtIterator it = m.listStatements();
while (it.hasNext()) {
Statement s = it.next();
Resource subject = s.getSubject();
Property pra = s.getPredicate();
RDFNode object = s.getObject();
if (object.isResource()) {
Resource obj = object.asResource();
if (pra.isResource()
&& obj.getURI().startsWith(
"http://dbpedia.org/resource/")) {
if (!relationmap.containsKey(subject.getURI())) {
LinkedList<String> list = new LinkedList<String>();
relationmap.put(subject.getURI(), list);
}
LinkedList<String> l = relationmap.get(subject.getURI());
l.add(pra.getURI().replaceAll(
"http://dbpedia.org/ontology/", "dbpediaOnt/")
+ ":::"
+ obj.getURI().replaceAll(
"http://dbpedia.org/resource/",
"dbpediaRes/"));
}
}
}
}
public void fillPattyRelationIndex(String pattern, String instance) {
File patternFile = new File(pattern);
HashMap<Integer, String> patternMap = new HashMap<Integer, String>();
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(patternFile));
reader.readLine();
String line = null;
while ((line = reader.readLine()) != null) {
String[] splitter = line.split("\\t");
Integer i = null;
try {
i = new Integer(Integer.valueOf(splitter[0]));
} catch (NumberFormatException e) {
e.printStackTrace();
}
patternMap.put(i, splitter[1]);
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
// Read Instancefile - either WikiTypes or Freebase Types
File instanceFile = new File(instance);
reader = null;
try {
reader = new BufferedReader(new FileReader(instanceFile));
reader.readLine();
String line = null;
while ((line = reader.readLine()) != null) {
String[] splitter = line.split("\\t");
Integer j = null;
try {
j = new Integer(Integer.valueOf(splitter[0]));
} catch (NumberFormatException e) {
e.printStackTrace();
}
String subject = WikiPediaUriConverter
.createConformDBpediaURI(splitter[1]);
String object = WikiPediaUriConverter.createConformDBpediaURI(
splitter[2]).replaceAll("http://dbpedia.org/resource/",
"");
if (!pattymap.containsKey(subject)) {
LinkedList<String> list = new LinkedList<String>();
pattymap.put(subject, list);
}
LinkedList<String> l = pattymap.get(subject);
l.add("patty/" + patternMap.get(j) + ":::" + "dbpediaRes/"
+ object);
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
public void fillPattyFreebaseRelationIndex(String pattern, String instance) {
File patternFile = new File(pattern);
HashMap<Integer, String> patternMap = new HashMap<Integer, String>();
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(patternFile));
reader.readLine();
String line = null;
while ((line = reader.readLine()) != null) {
String[] splitter = line.split("\\t");
Integer i = null;
try {
i = new Integer(Integer.valueOf(splitter[0]));
} catch (NumberFormatException e) {
e.printStackTrace();
}
patternMap.put(i, splitter[1]);
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
// Read Instancefile - either WikiTypes or Freebase Types
File instanceFile = new File(instance);
reader = null;
try {
reader = new BufferedReader(new FileReader(instanceFile));
reader.readLine();
String line = null;
while ((line = reader.readLine()) != null) {
String[] splitter = line.split("\\t");
Integer j = null;
try {
j = new Integer(Integer.valueOf(splitter[0]));
} catch (NumberFormatException e) {
e.printStackTrace();
}
String subject = WikiPediaUriConverter
.createConformDBpediaURI(splitter[1]);
String object = WikiPediaUriConverter.createConformDBpediaURI(
splitter[2]).replaceAll("http://dbpedia.org/resource/",
"");
if (!pattyfreebasemap.containsKey(subject)) {
LinkedList<String> list = new LinkedList<String>();
pattyfreebasemap.put(subject, list);
}
LinkedList<String> l = pattyfreebasemap.get(subject);
l.add("patty/" + patternMap.get(j) + ":::" + "dbpediaRes/"
+ object);
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
public void extractEvidence() {
File dir = new File(EVIDENCEDIRECTORY);
File[] files = dir.listFiles();
for (int i = 0; i < files.length; ++i) {
File currentFile = files[i];
String name = WikiPediaUriConverter
.createConformDBpediaURI(currentFile.getName());
String line = null;
try {
BufferedReader reader = new BufferedReader(new FileReader(
currentFile));
while ((line = reader.readLine()) != null) {
String[] splitter = line.split(",");
if (evidencemap.containsKey(name)) {
HashSet<String> set = evidencemap.get(name);
set.add(splitter[0]);
// System.out.println(name+ " "+ splitter[0]);
} else {
HashSet<String> set = new HashSet<String>();
set.add(splitter[0]);
// System.out.println(name+ " "+ splitter[0]);
evidencemap.put(name, set);
}
}
reader.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
public void insertWebOccurrences() {
File dir = new File(WEBOCCURRENCESDIRECTORY);
File[] files = dir.listFiles();
for (int i = 0; i < files.length; i++) {
try {
BufferedReader reader = new BufferedReader(new FileReader(
files[i]));
String line = null;
while ((line = reader.readLine()) != null) {
if (line.startsWith("MENTION")) {
String[] splitter = line.split("\\t");
String mention = splitter[1];
String uri = WikiPediaUriConverter
.createConformDBpediaURI(splitter[3]
.replaceAll(
"http://en.wikipedia.org/wiki/",
""));
// System.out.println("Mention: "+mention+" Uri: "+uri);
if (UNIQUELABELSTRINGS.containsKey(uri)) {
HashSet<String> set = UNIQUELABELSTRINGS.get(uri);
set.add(mention.toLowerCase());
} else {
HashSet<String> set = new HashSet<String>();
set.add(mention.toLowerCase());
UNIQUELABELSTRINGS.put(uri, set);
}
if (OCCURRENCES.containsKey(uri)) {
HashMap<String, Integer> map = OCCURRENCES.get(uri);
if (map.containsKey(mention)) {
Integer j = map.get(mention);
j++;
map.put(mention, j);
} else {
map.put(mention, new Integer(1));
}
} else {
HashMap<String, Integer> map = new HashMap<String, Integer>();
map.put(mention, new Integer(1));
OCCURRENCES.put(uri, map);
}
}
}
reader.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
private void addOccurrence(String uri, String sf, int amount) {
HashMap<String, Integer> occ = OCCURRENCES.get(uri);
if (occ.containsKey(sf)) {
int i = occ.get(sf);
i += amount;
occ.put(sf, i);
} else {
occ.put(sf, amount);
}
}
public static void main(String[] args) {
CreateDBPediaIndex index = new CreateDBPediaIndex();
System.out.println("Preprocessing:");
System.out.println("Step Evidence: ");
// index.extractEvidence();
System.out.println("Fill Index Relations:");
index.fillRelationsIndex();
index.fillPattyRelationIndex(PATTYWIKIPATTERN, PATTYWIKIINSTANCE);
index.fillPattyFreebaseRelationIndex(PATTYFREEBASEPATTERN,
PATTYFREEBASEINSTANCE);
System.out.println("Step1:");
index.workLinkText();
System.out.println("Step2:");
index.workEntities();
System.out.println("Step3:");
index.workRedirects();
System.out.println("Step4:");
// index.insertWebOccurrences();
System.out.println("Step5:");
index.createNewIndex();
System.out.println("EvidenceCounter: " + evidencecounter);
}
}