package doser.tools.indexcreation; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.net.URLDecoder; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.jgrapht.UndirectedGraph; import org.jgrapht.graph.DefaultEdge; import org.jgrapht.graph.SimpleGraph; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdtjena.HDTGraph; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; import com.hp.hpl.jena.query.QueryException; import com.hp.hpl.jena.query.QueryExecution; import com.hp.hpl.jena.query.QueryExecutionFactory; import com.hp.hpl.jena.query.QueryFactory; import com.hp.hpl.jena.query.QueryParseException; import com.hp.hpl.jena.query.QuerySolution; import com.hp.hpl.jena.query.ResultSet; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.RDFNode; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.rdf.model.Statement; import com.hp.hpl.jena.rdf.model.StmtIterator; import doser.lucene.analysis.DoserIDAnalyzer; public class CreateDBpediaIndexV2 { public static final String SURFACEFORMDIRECTORY = "/home/zwicklbauer/surfaceforms"; public static final String SURFACEFORMDIRECTORYN3 = "/home/zwicklbauer/surfaceforms/n3/"; public static final String OLDINDEX = "/mnt/ssd1/disambiguation/MMapLuceneIndexStandard/"; public static final String NEWINDEX = "/home/zwicklbauer/NewIndexTryout"; public static final String ENTITYLIST = "/home/zwicklbauer/WikipediaEntities/entityList_Default_HoffartNew.dat"; public static final String MAPPINGPROPERTIES = "/home/zwicklbauer/HDTGeneration/mappingbased_properties_cleaned_en.nt"; public static final String PERSONDATAHDT = "/mnt/ssd1/disambiguation/HDT/persondata_en.hdt"; public static final String INFOBOXPROPERTIES = "/home/zwicklbauer/HDTGeneration/infobox_properties_en.nt"; public static final String DISAMBIGUATIONWIKILINKS = "/home/zwicklbauer/HDTGeneration/disambiguations_en.nt"; public static final String PATTYWIKIPATTERN = "/home/zwicklbauer/Patty/patty-dataset-WikiTypes/wikipedia-patterns.txt"; public static final String PATTYWIKIINSTANCE = "/home/zwicklbauer/Patty/patty-dataset-WikiTypes/wikipedia-instances.txt"; public static final String PATTYFREEBASEPATTERN = "/home/zwicklbauer/Patty/patty-dataset-freebase/wikipedia-patterns.txt"; public static final String PATTYFREEBASEINSTANCE = "/home/zwicklbauer/Patty/patty-dataset-freebase/wikipedia-instances.txt"; public static final String EVIDENCEFILE = "/home/zwicklbauer/word2vec/evidences.dat"; public static final String WEBOCCURRENCESDIRECTORY = "/home/zwicklbauer/WikipediaEntities/EntitiesWebContext/"; public static final String LINKTEXT = "/home/zwicklbauer/WikipediaEntities/enwiki-latest/linktext"; public static final String ENTITIES = "/home/zwicklbauer/WikipediaEntities/entities_StandardParse_threshold12"; public static final String REDIRECTS = "/home/zwicklbauer/WikipediaEntities/enwiki-latest/redirects"; public static final String ARTICLECATEGORIES = "/home/zwicklbauer/HDTGeneration/article_categories_en.nt"; public static final String LABELHDT = "/home/zwicklbauer/WikipediaIndexGeneration/rdffiles/labels_en.hdt"; public static final String SHORTDESCHDT = "/home/zwicklbauer/WikipediaIndexGeneration/rdffiles/short_abstracts_en.hdt"; public static final String LONGDESCHDT = "/home/zwicklbauer/WikipediaIndexGeneration/rdffiles/long_abstracts_en.hdt"; public static final String INSTANCEMAPPINGTYPES = "/mnt/ssd1/disambiguation/HDT/instance_types_en.hdt"; public static final String INSTANCEMAPPINGTYPES_NT = "/mnt/ssd1/disambiguation/HDT/instance_types_en.nt"; public static final String SKOSBROADER = "/home/zwicklbauer/HDTGeneration/skos_categories_en.nt"; public static final String EXTERNSFDIRECTORY = "/home/zwicklbauer/SurfaceForms/"; private HashMap<String, HashSet<String>> LABELS; private HashSet<String> entities; private HashMap<String, LinkedList<String>> relationmap; private HashMap<String, LinkedList<String>> pattymap; private HashMap<String, LinkedList<String>> pattyfreebasemap; private HashMap<String, String> evidences; private HashSet<String> teams; private HashMap<String, HashSet<String>> UNIQUELABELSTRINGS; private HashMap<String, HashMap<String, Integer>> OCCURRENCES; private HashMap<String, Integer> DBPEDIAGRAPHINLINKS; private HashMap<String, String> urlentitymapping; private int counter; private Model labelmodel; private Model shortdescmodel; private Model longdescmodel; private Model persondata; private Model instancemappingtypes; public CreateDBpediaIndexV2() { super(); this.relationmap = new HashMap<String, LinkedList<String>>(); this.pattymap = new HashMap<String, LinkedList<String>>(); this.pattyfreebasemap = new HashMap<String, LinkedList<String>>(); this.OCCURRENCES = new HashMap<String, HashMap<String, Integer>>(); this.LABELS = new HashMap<String, HashSet<String>>(); this.UNIQUELABELSTRINGS = new HashMap<String, HashSet<String>>(); this.DBPEDIAGRAPHINLINKS = new HashMap<String, Integer>(); this.evidences = new HashMap<String, String>(); this.teams = new HashSet<String>(); this.urlentitymapping = new HashMap<String, String>(); this.entities = new HashSet<String>(); this.counter = 0; HDT labelhdt; HDT shortdeschdt; HDT longdeschdt; HDT mappingbasedproperties; HDT instancemappingtypeshdt; try { labelhdt = HDTManager.mapIndexedHDT(LABELHDT, null); shortdeschdt = HDTManager.mapIndexedHDT(SHORTDESCHDT, null); longdeschdt = HDTManager.mapIndexedHDT(LONGDESCHDT, null); mappingbasedproperties = HDTManager.mapIndexedHDT(PERSONDATAHDT, null); instancemappingtypeshdt = HDTManager.mapIndexedHDT(INSTANCEMAPPINGTYPES, null); final HDTGraph labelhdtgraph = new HDTGraph(labelhdt); final HDTGraph shortdeschdtgraph = new HDTGraph(shortdeschdt); final HDTGraph longdeschdtgraph = new HDTGraph(longdeschdt); final HDTGraph instancepersondata = new HDTGraph(mappingbasedproperties); final HDTGraph instancemappingtypesgraph = new HDTGraph(instancemappingtypeshdt); this.labelmodel = ModelFactory.createModelForGraph(labelhdtgraph); this.shortdescmodel = ModelFactory.createModelForGraph(shortdeschdtgraph); this.longdescmodel = ModelFactory.createModelForGraph(longdeschdtgraph); this.persondata = ModelFactory.createModelForGraph(instancepersondata); this.instancemappingtypes = ModelFactory.createModelForGraph(instancemappingtypesgraph); } catch (IOException e) { e.printStackTrace(); } } public void loadEvidences() { File file = new File(EVIDENCEFILE); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(file)); String line = null; while ((line = reader.readLine()) != null) { String splitter[] = line.split("\\t"); this.evidences.put(splitter[0], splitter[1]); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } } public void createDBpediaPriors() { UndirectedGraph<String, DefaultEdge> graph = new SimpleGraph<String, DefaultEdge>(DefaultEdge.class); Model m = ModelFactory.createDefaultModel(); m.read(INFOBOXPROPERTIES); StmtIterator it = m.listStatements(); while (it.hasNext()) { Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith("http://dbpedia.org/resource/")) { if (!subject.getURI().equalsIgnoreCase(obj.getURI())) { graph.addVertex(subject.getURI()); graph.addVertex(obj.getURI()); graph.addEdge(subject.getURI(), obj.getURI()); } } } counter++; } m = ModelFactory.createDefaultModel(); m.read(MAPPINGPROPERTIES); it = m.listStatements(); counter = 0; while (it.hasNext()) { Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith("http://dbpedia.org/resource/")) { if (!subject.getURI().equalsIgnoreCase(obj.getURI())) { graph.addVertex(subject.getURI()); graph.addVertex(obj.getURI()); graph.addEdge(subject.getURI(), obj.getURI()); } } } } m = ModelFactory.createDefaultModel(); m.read(SKOSBROADER); it = m.listStatements(); while (it.hasNext()) { Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith("http://dbpedia.org/resource/")) { if (!subject.getURI().equalsIgnoreCase(obj.getURI())) { graph.addVertex(subject.getURI()); graph.addVertex(obj.getURI()); graph.addEdge(subject.getURI(), obj.getURI()); } } } } m = ModelFactory.createDefaultModel(); m.read(ARTICLECATEGORIES); it = m.listStatements(); while (it.hasNext()) { Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith("http://dbpedia.org/resource/")) { if (!subject.getURI().equalsIgnoreCase(obj.getURI())) { graph.addVertex(subject.getURI()); graph.addVertex(obj.getURI()); graph.addEdge(subject.getURI(), obj.getURI()); } } } } Set<String> set = graph.vertexSet(); for (String s : set) { DBPEDIAGRAPHINLINKS.put(s, graph.degreeOf(s)); } } public void fillPropertiesIndex() { Model m = ModelFactory.createDefaultModel(); m.read(INFOBOXPROPERTIES); StmtIterator it = m.listStatements(); while (it.hasNext()) { Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith("http://dbpedia.org/resource/")) { if (!relationmap.containsKey(subject.getURI())) { LinkedList<String> list = new LinkedList<String>(); relationmap.put(subject.getURI(), list); } LinkedList<String> l = relationmap.get(subject.getURI()); l.add(pra.getURI().replaceAll("http://dbpedia.org/property/", "dbpediaOnt/") + ":::" + obj.getURI().replaceAll("http://dbpedia.org/resource/", "dbpediaRes/")); } } } } public void fillRelationsIndex() { Model m = ModelFactory.createDefaultModel(); m.read(MAPPINGPROPERTIES); StmtIterator it = m.listStatements(); while (it.hasNext()) { Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith("http://dbpedia.org/resource/")) { if (!relationmap.containsKey(subject.getURI())) { LinkedList<String> list = new LinkedList<String>(); relationmap.put(subject.getURI(), list); } LinkedList<String> l = relationmap.get(subject.getURI()); l.add(pra.getURI().replaceAll("http://dbpedia.org/ontology/", "dbpediaOnt/") + ":::" + obj.getURI().replaceAll("http://dbpedia.org/resource/", "dbpediaRes/")); } } } } public void fillPattyRelationIndex(String pattern, String instance) { File patternFile = new File(pattern); HashMap<Integer, String> patternMap = new HashMap<Integer, String>(); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(patternFile)); reader.readLine(); String line = null; while ((line = reader.readLine()) != null) { String[] splitter = line.split("\\t"); Integer i = null; try { i = new Integer(Integer.valueOf(splitter[0])); } catch (NumberFormatException e) { e.printStackTrace(); } patternMap.put(i, splitter[1]); } } catch (IOException e) { e.printStackTrace(); } finally { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } // Read Instancefile - either WikiTypes or Freebase Types File instanceFile = new File(instance); reader = null; try { reader = new BufferedReader(new FileReader(instanceFile)); reader.readLine(); String line = null; while ((line = reader.readLine()) != null) { String[] splitter = line.split("\\t"); Integer j = null; try { j = new Integer(Integer.valueOf(splitter[0])); } catch (NumberFormatException e) { e.printStackTrace(); } String subject = WikiPediaUriConverter.createConformDBpediaURI(splitter[1]); String object = WikiPediaUriConverter.createConformDBpediaURI(splitter[2]) .replaceAll("http://dbpedia.org/resource/", ""); if (!pattymap.containsKey(subject)) { LinkedList<String> list = new LinkedList<String>(); pattymap.put(subject, list); } LinkedList<String> l = pattymap.get(subject); l.add("patty/" + patternMap.get(j) + ":::" + "dbpediaRes/" + object); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } } public void fillPattyFreebaseRelationIndex(String pattern, String instance) { File patternFile = new File(pattern); HashMap<Integer, String> patternMap = new HashMap<Integer, String>(); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(patternFile)); reader.readLine(); String line = null; while ((line = reader.readLine()) != null) { String[] splitter = line.split("\\t"); Integer i = null; try { i = new Integer(Integer.valueOf(splitter[0])); } catch (NumberFormatException e) { e.printStackTrace(); } patternMap.put(i, splitter[1]); } } catch (IOException e) { e.printStackTrace(); } finally { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } // Read Instancefile - either WikiTypes or Freebase Types File instanceFile = new File(instance); reader = null; try { reader = new BufferedReader(new FileReader(instanceFile)); reader.readLine(); String line = null; while ((line = reader.readLine()) != null) { String[] splitter = line.split("\\t"); Integer j = null; try { j = new Integer(Integer.valueOf(splitter[0])); } catch (NumberFormatException e) { e.printStackTrace(); } String subject = WikiPediaUriConverter.createConformDBpediaURI(splitter[1]); String object = WikiPediaUriConverter.createConformDBpediaURI(splitter[2]) .replaceAll("http://dbpedia.org/resource/", ""); if (!pattyfreebasemap.containsKey(subject)) { LinkedList<String> list = new LinkedList<String>(); pattyfreebasemap.put(subject, list); } LinkedList<String> l = pattyfreebasemap.get(subject); l.add("patty/" + patternMap.get(j) + ":::" + "dbpediaRes/" + object); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } } public void workLinkText() { File f = new File(LINKTEXT); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(f)); String line = null; while ((line = reader.readLine()) != null) { String split[] = line.split("\\t"); if (split.length > 2) { for (int i = 2; i < split.length; ++i) { String ent = split[i]; String[] occ = ent.split(":"); // Bugfix: Fix Wrong splitting StringBuilder builder = new StringBuilder(); for (int j = 0; j < occ.length - 1; j++) { builder.append(occ[j] + ":"); } String nr = occ[occ.length - 1]; String entity = builder.toString(); entity = entity.substring(0, entity.length() - 1); String uri = WikiPediaUriConverter.createConformDBpediaURI(entity); if (!uri.contains("(Disambiguation)")) { // UniqueLabelStrings if (UNIQUELABELSTRINGS.containsKey(uri)) { HashSet<String> set = UNIQUELABELSTRINGS.get(uri); set.add(split[0].toLowerCase()); addUniqueCandidateWithoutSpecialChars(set, split[0]); } else { HashSet<String> set = new HashSet<String>(); set.add(split[0].toLowerCase()); addUniqueCandidateWithoutSpecialChars(set, split[0]); UNIQUELABELSTRINGS.put(uri, set); } // Occurrences if (!OCCURRENCES.containsKey(uri)) { HashMap<String, Integer> map = new HashMap<String, Integer>(); OCCURRENCES.put(uri, map); } addOccurrence(uri, split[0].toLowerCase(), Integer.valueOf(nr)); } } } } } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } } public void getUniqueLabelsFromOldIndex() { File oldIndexFile = new File(OLDINDEX); IndexReader readerOldIndex = null; try { final Directory oldDir = FSDirectory.open(oldIndexFile); readerOldIndex = DirectoryReader.open(oldDir); for (int j = 0; j < readerOldIndex.maxDoc(); ++j) { Document oldDoc = readerOldIndex.document(j); String[] oldUniqueLabels = oldDoc.getValues("UniqueLabelString"); String oldResource = oldDoc.get("Mainlink"); // Transform old to new Namespace oldResource = oldResource.replaceAll("http://dbpedia.org/resource/", ""); oldResource = URLDecoder.decode(oldResource, "UTF-8"); oldResource = WikiPediaUriConverter.createConformDBpediaURI(oldResource); // Old Unique Labels if (UNIQUELABELSTRINGS.containsKey(oldResource)) { HashSet<String> set = UNIQUELABELSTRINGS.get(oldResource); if (oldUniqueLabels != null && oldUniqueLabels.length > 0) { for (int k = 0; k < oldUniqueLabels.length; ++k) { set.add(oldUniqueLabels[k].toLowerCase()); addUniqueCandidateWithoutSpecialChars(set, oldUniqueLabels[k]); } } } else { HashSet<String> set = new HashSet<String>(); if (oldUniqueLabels != null && oldUniqueLabels.length > 0) { for (int k = 0; k < oldUniqueLabels.length; ++k) { set.add(oldUniqueLabels[k].toLowerCase()); addUniqueCandidateWithoutSpecialChars(set, oldUniqueLabels[k]); } } UNIQUELABELSTRINGS.put(oldResource, set); } if (!OCCURRENCES.containsKey(oldResource)) { OCCURRENCES.put(oldResource, new HashMap<String, Integer>()); } String oldOccurrences = oldDoc.get("Occurrences"); if ((oldOccurrences != null) && !oldOccurrences.equalsIgnoreCase("")) { final String[] splitter = oldOccurrences.split(";;;"); for (final String element : splitter) { final String[] splitter1 = element.split(":::"); int check = 1; try { check = Integer.valueOf(splitter1[1]); } catch (final NumberFormatException e) { Logger.getRootLogger().error("Warning NumberFormatException while Initialization: "); } addOccurrence(oldResource, splitter1[0], check); } } } readerOldIndex.close(); } catch (IOException e) { e.printStackTrace(); } finally { if (readerOldIndex != null) { try { readerOldIndex.close(); } catch (IOException e) { e.printStackTrace(); } } } } public void workEntities() { File f = new File(ENTITIES); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(f)); String line = null; while ((line = reader.readLine()) != null) { String split[] = line.split("\\t"); if (split.length > 2) { for (int i = 2; i < split.length; ++i) { String ent = split[i]; String[] occ = ent.split(":"); String uri = WikiPediaUriConverter.createConformDBpediaURI(occ[0]); // Synonyms if (LABELS.containsKey(uri)) { HashSet<String> set = LABELS.get(uri); // Add Label to UniqueLabel addLabelToUniqueLabel(uri, split[0]); set.add(split[0].toLowerCase()); } else { HashSet<String> set = new HashSet<String>(); // set.add(occ[0].toLowerCase()); set.add(split[0].toLowerCase()); // Add Label to UniqueLabel addLabelToUniqueLabel(uri, split[0]); LABELS.put(uri, set); } } } } } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } } private void addLabelToUniqueLabel(String entity, String label) { if (UNIQUELABELSTRINGS.containsKey(entity)) { HashSet<String> set = UNIQUELABELSTRINGS.get(entity); set.add(label.toLowerCase()); addUniqueCandidateWithoutSpecialChars(set, label); } } public void workRedirects() { File f = new File(REDIRECTS); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(f)); String line = null; while ((line = reader.readLine()) != null) { String split[] = line.split("\\t"); // Bug Fix of wrong redirects if (split.length < 3) { String uri = WikiPediaUriConverter.createConformDBpediaURI(split[1]); if (LABELS.containsKey(uri)) { HashSet<String> set = LABELS.get(uri); set.add(split[0].toLowerCase()); // Add Label to UniqueLabel addLabelToUniqueLabel(uri, split[0]); } else { HashSet<String> set = new HashSet<String>(); set.add(split[0].toLowerCase()); // Add Label to UniqueLabel addLabelToUniqueLabel(uri, split[0]); LABELS.put(uri, set); } } } } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } } public void createNewIndex() { File newIndexFile = new File(NEWINDEX); try { final Directory newDir = FSDirectory.open(newIndexFile); Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>(); analyzerPerField.put("Label", new DoserIDAnalyzer()); analyzerPerField.put("PattyRelations", new DoserIDAnalyzer()); analyzerPerField.put("PattyFreebaseRelations", new DoserIDAnalyzer()); analyzerPerField.put("Relations", new DoserIDAnalyzer()); analyzerPerField.put("Occurrences", new DoserIDAnalyzer()); analyzerPerField.put("Type", new DoserIDAnalyzer()); analyzerPerField.put("StringLabel", new DoserIDAnalyzer()); PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField); final IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, aWrapper); final IndexWriter newIndexWriter = new IndexWriter(newDir, config); for (String uri : entities) { Document doc = new Document(); // Add ID doc.add(new StringField("ID", "DBpedia_" + String.valueOf(counter), Store.YES)); counter++; // Add Mainlink doc.add(new StringField("Mainlink", uri, Store.YES)); // Add Labels List<String> origLabels = getDbPediaLabel(uri); HashSet<String> labelset = LABELS.get(uri); if (labelset == null) { labelset = new HashSet<String>(); } for (String s : origLabels) { labelset.add(s); } for (String s : labelset) { doc.add(new TextField("Label", s.toLowerCase(), Store.YES)); doc.add(new StringField("StringLabel", s.toLowerCase(), Store.YES)); } // Add ShortDescriptions String shortDescription = getDbPediaShortDescription(uri); doc.add(new TextField("ShortDescription", shortDescription, Store.YES)); // Add longDescriptions String longDescription = getDbPediaLongDescription(uri); doc.add(new TextField("LongDescription", longDescription, Store.YES)); // Add Type String type = filterStandardDomain(getRDFTypesFromEntity(uri)); doc.add(new StringField("Type", type, Store.YES)); // Add Occurrences HashMap<String, Integer> occs = OCCURRENCES.get(uri); if (uri.equals("http://dbpedia.org/resource/Real_Madrid_C.F.")) { occs.put("real", 5000); } StringBuilder builder = new StringBuilder(); if (occs != null) { for (Map.Entry<String, Integer> entry : occs.entrySet()) { String key = entry.getKey(); int value = entry.getValue(); builder.append(key + ":::" + String.valueOf(value) + ";;;"); } } String occurrenceString = builder.toString(); if (occurrenceString.length() > 0) { occurrenceString = occurrenceString.substring(0, occurrenceString.length() - 3); } doc.add(new StringField("Occurrences", occurrenceString, Store.YES)); // UniqueLabelStrings HashSet<String> keys = UNIQUELABELSTRINGS.get(uri); // Füge noch die Sportsteams hinzu if (keys == null) { keys = new HashSet<String>(); } if (teams.contains(uri)) { keys.addAll(extractSportsTeamNames(labelset, uri)); } // Füge noch weitere Personennamen hinzu // Flip Person Names Vorname <=> Nachname if(type.equalsIgnoreCase("Person")) { keys.addAll(addSomeMorePersonNames(uri)); } keys.addAll(addAdditionalPersonNameOccurrences(uri)); for (String s : origLabels) { keys.add(s.toLowerCase()); addUniqueCandidateWithoutSpecialChars(keys, s); } for (String s : keys) { doc.add(new StringField("UniqueLabel", s, Store.YES)); } // Add DBPedia Facts if (relationmap.containsKey(uri)) { LinkedList<String> l = relationmap.get(uri); builder = new StringBuilder(); if (l != null) { for (String str : l) { builder.append(str); builder.append(";;;"); } } String s = builder.toString(); if (s.length() > 0) { s = s.substring(0, s.length() - 3); } doc.add(new TextField("Relations", s, Store.YES)); } else { doc.add(new TextField("Relations", "", Store.YES)); } // Add PattyFacts if (pattymap.containsKey(uri)) { LinkedList<String> l = pattymap.get(uri); builder = new StringBuilder(); if (l != null) { for (String str : l) { builder.append(str); builder.append(";;;"); } } String s = builder.toString(); if (s.length() > 0) { s = s.substring(0, s.length() - 3); } doc.add(new TextField("PattyRelations", s, Store.YES)); } else { doc.add(new TextField("PattyRelations", "", Store.YES)); } // Add PattyFreebaseFacts if (pattyfreebasemap.containsKey(uri)) { LinkedList<String> l = pattyfreebasemap.get(uri); builder = new StringBuilder(); if (l != null) { for (String str : l) { builder.append(str); builder.append(";;;"); } } String s = builder.toString(); if (s.length() > 0) { s = s.substring(0, s.length() - 3); } doc.add(new TextField("PattyFreebaseRelations", s, Store.YES)); } else { doc.add(new TextField("PattyFreebaseRelations", "", Store.YES)); } // Add DBpediaPriors if (DBPEDIAGRAPHINLINKS.containsKey(uri)) { doc.add(new IntField("DbpediaVertexDegree", DBPEDIAGRAPHINLINKS.get(uri), Field.Store.YES)); } // Add Evidences // if(evidences.containsKey(uri)) { // Set<String> ev = extractEvidences(evidences.get(uri)); // for(String s : ev) { // doc.add(new StringField("Evidence", s, Field.Store.YES)); // } // } // Add DBpedia RDFS Label Occurrences // Set<String> dbpediaoccs = createDBpediaOccs(origLabels); // for (String s : dbpediaoccs) { // doc.add(new StringField("DBpediaUniqueLabel", s, Store.YES)); // } // Write Document To Index if (doc.get("Label") != null && !doc.get("Label").equalsIgnoreCase("")) { newIndexWriter.addDocument(doc); } } newIndexWriter.close(); } catch (IOException e) { e.printStackTrace(); } } private HashSet<String> addSomeMorePersonNames(final String uri) { HashSet<String> names = new HashSet<String>(); try { final String query = "SELECT ?name WHERE{ <" + uri + "> <http://xmlns.com/foaf/0.1/name> ?name. }"; ResultSet results = null; QueryExecution qexec = null; final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query); qexec = QueryExecutionFactory.create(cquery, this.persondata); results = qexec.execSelect(); if (results != null) { while (results.hasNext()) { final QuerySolution sol = results.nextSolution(); final String surname = sol.getLiteral("name").getLexicalForm(); names.add(surname.toLowerCase()); } qexec.close(); } } catch (QueryParseException e) { Logger.getRootLogger().info("Query parse Exception"); } String reducedUri = uri.replaceAll("http://dbpedia.org/resource/", ""); String splitter[] = reducedUri.split("_"); if(splitter.length == 2) { names.add((splitter[0]+" "+splitter[1]).toLowerCase()); names.add((splitter[1]+" "+splitter[0]).toLowerCase()); } return names; } // public Set<String> createDBpediaOccs(List<String> labels) { // Set<String> set = new HashSet<String>(); // for (String s : labels) { // set.add(s.toLowerCase()); // set.add(s.toLowerCase().replaceAll("[^A-Za-z0-9 ]", "")); // String[] splitter = s.split(" "); // if (splitter.length == 2) { // for (int i = 0; i < splitter.length; i++) { // splitter[i] = splitter[i].replaceAll("[^A-Za-z0-9 ]", ""); // } // set.add(splitter[0].toLowerCase()); // set.add(splitter[1].toLowerCase()); // } else if (splitter.length > 2) { // boolean hasKomma = false; // int j = -1; // for (int i = 0; i < splitter.length; i++) { // if (splitter[i].endsWith(",")) { // hasKomma = true; // j = i; // break; // } // } // if (hasKomma) { // StringBuilder builder = new StringBuilder(); // StringBuilder withbuilder = new StringBuilder(); // for (int i = 0; i <= j; ++i) { // builder.append(splitter[i].replaceAll("[^A-Za-z0-9 ]", // "").toLowerCase()); // withbuilder.append(splitter[i].replaceAll(",", "").toLowerCase()); // if (i < j) { // builder.append(" "); // withbuilder.append(" "); // } // } // set.add(builder.toString()); // set.add(withbuilder.toString()); // builder = new StringBuilder(); // withbuilder = new StringBuilder(); // for(int i = j + 1; i < splitter.length; ++i) { // builder.append(splitter[i].replaceAll("[^A-Za-z0-9 ]", // "").toLowerCase()); // withbuilder.append(splitter[i].replaceAll(",", "").toLowerCase()); // System.out.println(i+" "+splitter.length); // if(i < splitter.length - 1) { // System.out.println("JHUUUU"+builder.toString()); // builder.append(" "); // withbuilder.append(" "); // } // } // set.add(builder.toString()); // set.add(withbuilder.toString()); // } // } // // // Das erste Wort //// set.add(splitter[0].toLowerCase()); //// if (splitter.length > 1) { //// // Das letzte Wort //// set.add(splitter[splitter.length - 1].toLowerCase()); //// } // // Abkürzungen // // StringBuilder builderWith = new StringBuilder(); // // StringBuilder builderWithout = new StringBuilder(); // // for(int i = 0; i < splitter.length; ++i) { // // builderWith.append(splitter[i].substring(0, 1)+"."); // // builderWithout.append(splitter[i].substring(0, 1)); // // } // // set.add(builderWith.toString().toLowerCase()); // // if(builderWithout.length() > 1) { // // set.add(builderWithout.toString().toLowerCase()); // // } // // N-Gramme // // NgramIterator ngram = new NgramIterator(2, s); // // while(ngram.hasNext()) { // // set.add(ngram.next().toLowerCase()); // // } // // NgramIterator ngram3 = new NgramIterator(3, s); // // while(ngram3.hasNext()) { // // set.add(ngram3.next().toLowerCase()); // // } // } // return set; // } // // public List<String> getDbPediaLabel(final String uri) // throws QueryException, QueryParseException { public List<String> getDbPediaLabel(final String uri) throws QueryException, QueryParseException { final List<String> labellist = new LinkedList<String>(); try { final String query = "SELECT ?label WHERE{ <" + uri + "> <http://www.w3.org/2000/01/rdf-schema#label> ?label. }"; ResultSet results = null; QueryExecution qexec = null; final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query); qexec = QueryExecutionFactory.create(cquery, this.labelmodel); results = qexec.execSelect(); if (results != null) { while (results.hasNext()) { final QuerySolution sol = results.nextSolution(); final String label = sol.getLiteral("label").getLexicalForm(); labellist.add(label); } qexec.close(); } } catch (QueryParseException e) { Logger.getRootLogger().info("Query parse Exception"); } return labellist; } public String getDbPediaShortDescription(final String uri) throws QueryException, QueryParseException { String labellist = ""; try { final String query = "SELECT ?comment WHERE{ <" + uri + "> <http://www.w3.org/2000/01/rdf-schema#comment> ?comment. }"; ResultSet results = null; QueryExecution qexec = null; final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query); qexec = QueryExecutionFactory.create(cquery, this.shortdescmodel); results = qexec.execSelect(); if (results != null) { while (results.hasNext()) { final QuerySolution sol = results.nextSolution(); String desc = sol.getLiteral("comment").getLexicalForm(); labellist = desc; } qexec.close(); } } catch (QueryParseException e) { Logger.getRootLogger().info("Query parse Exception"); } return labellist; } public String getDbPediaLongDescription(final String uri) throws QueryException, QueryParseException { String labellist = ""; try { final String query = "SELECT ?comment WHERE{ <" + uri + "> <http://dbpedia.org/ontology/abstract> ?comment. }"; ResultSet results = null; QueryExecution qexec = null; final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query); qexec = QueryExecutionFactory.create(cquery, this.longdescmodel); results = qexec.execSelect(); if (results != null) { while (results.hasNext()) { final QuerySolution sol = results.nextSolution(); final String desc = sol.getLiteral("comment").getLexicalForm(); labellist = desc; } qexec.close(); } } catch (QueryParseException e) { Logger.getRootLogger().info("Query parse Exception"); } return labellist; } public void readEntities() { File f = new File(ENTITYLIST); try { String line = null; BufferedReader reader = new BufferedReader(new FileReader(f)); while ((line = reader.readLine()) != null) { String uri = URLDecoder.decode(line, "UTF-8").replaceAll("http://dbpedia.org/resource/", ""); entities.add(WikiPediaUriConverter.createConformDBpediaURI(uri)); } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public void insertWebOccurrences() { File dir = new File(WEBOCCURRENCESDIRECTORY); File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { try { BufferedReader reader = new BufferedReader(new FileReader(files[i])); String line = null; while ((line = reader.readLine()) != null) { if (line.startsWith("MENTION")) { String[] splitter = line.split("\\t"); String mention = splitter[1]; String uri = WikiPediaUriConverter .createConformDBpediaURI(splitter[3].replaceAll("http://en.wikipedia.org/wiki/", "")); // System.out.println("Mention: "+mention+" Uri: "+uri); if (UNIQUELABELSTRINGS.containsKey(uri)) { HashSet<String> set = UNIQUELABELSTRINGS.get(uri); set.add(mention.toLowerCase()); } else { HashSet<String> set = new HashSet<String>(); set.add(mention.toLowerCase()); UNIQUELABELSTRINGS.put(uri, set); } } } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } private void addOccurrence(String uri, String sf, int amount) { HashMap<String, Integer> occ = OCCURRENCES.get(uri); if (occ.containsKey(sf)) { int i = occ.get(sf); i += amount; occ.put(sf, i); } else { occ.put(sf, amount); } if (UNIQUELABELSTRINGS.containsKey(uri)) { HashSet<String> set = UNIQUELABELSTRINGS.get(uri); set.add(sf.toLowerCase()); addUniqueCandidateWithoutSpecialChars(set, sf); } else { HashSet<String> set = new HashSet<String>(); set.add(sf.toLowerCase()); addUniqueCandidateWithoutSpecialChars(set, sf); UNIQUELABELSTRINGS.put(uri, set); } } private void addUniqueCandidateWithoutSpecialChars(HashSet<String> set, String sf) { String newsf = sf.toLowerCase().replaceAll("[^a-zA-Z ]", ""); if (newsf.length() > 2) { set.add(newsf); } } public HashSet<String> addAdditionalPersonNameOccurrences(String res) { HashSet<String> names = new HashSet<String>(); try { final String query = "SELECT ?surname WHERE{ <" + res + "> <http://xmlns.com/foaf/0.1/surname> ?surname. }"; ResultSet results = null; QueryExecution qexec = null; final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query); qexec = QueryExecutionFactory.create(cquery, this.persondata); results = qexec.execSelect(); if (results != null) { while (results.hasNext()) { final QuerySolution sol = results.nextSolution(); final String surname = sol.getLiteral("surname").getLexicalForm(); names.add(surname.toLowerCase()); } qexec.close(); } } catch (QueryParseException e) { Logger.getRootLogger().info("Query parse Exception"); } // Constraint dass es eine Person ist if (names.size() > 0) { String rdfslabel = ""; try { final String query = "SELECT ?label WHERE{ <" + res + "> <http://www.w3.org/2000/01/rdf-schema#label> ?label. }"; ResultSet results = null; QueryExecution qexec = null; final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query); qexec = QueryExecutionFactory.create(cquery, this.labelmodel); results = qexec.execSelect(); if (results != null) { while (results.hasNext()) { final QuerySolution sol = results.nextSolution(); final String label = sol.getLiteral("label").getLexicalForm(); rdfslabel = label; } qexec.close(); } } catch (QueryParseException e) { Logger.getRootLogger().info("Query parse Exception"); } String splitter[] = rdfslabel.split(" "); if (splitter.length > 2) { // Generiere verschiedene Namensmöglichkeiten for (int i = 0; i < splitter.length; i++) { for (int j = 0; j < splitter.length; j++) { if (!splitter[i].equalsIgnoreCase(splitter[j])) { names.add((splitter[i] + " " + splitter[j]).toLowerCase()); } } } } } return names; } public void readWikiPageDisambiguation() { Model m = ModelFactory.createDefaultModel(); m.read(DISAMBIGUATIONWIKILINKS); StmtIterator it = m.listStatements(); while (it.hasNext()) { Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith("http://dbpedia.org/resource/")) { String label = subject.getURI().replaceAll("http://dbpedia.org/resource/", ""); label = label.replaceAll("\\_\\(disambiguation\\)", "").toLowerCase(); if (UNIQUELABELSTRINGS.containsKey(obj.getURI())) { HashSet<String> set = UNIQUELABELSTRINGS.get(obj.getURI()); set.add(label); addUniqueCandidateWithoutSpecialChars(set, label); } else { HashSet<String> set = new HashSet<String>(); set.add(label); addUniqueCandidateWithoutSpecialChars(set, label); UNIQUELABELSTRINGS.put(obj.getURI(), set); } } } } } public void sportsTeamsSurfaceForms() { Model m = ModelFactory.createDefaultModel(); m.read(INSTANCEMAPPINGTYPES_NT); StmtIterator it = m.listStatements(); while (it.hasNext()) { Statement s = it.next(); Resource subject = s.getSubject(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (obj.getURI().equalsIgnoreCase("http://dbpedia.org/ontology/SportsTeam")) { teams.add(subject.getURI()); } } } } private HashSet<String> extractSportsTeamNames(HashSet<String> set, String uri) { HashSet<String> newStringSet = new HashSet<String>(); for (String s : set) { String splitter[] = s.split(" "); for (int i = 0; i < splitter.length; i++) { if (splitter[i].equalsIgnoreCase(splitter[i].replaceAll("[^a-zA-Z ]", ""))) { if (splitter[i].toLowerCase().length() > 3) { newStringSet.add(splitter[i].toLowerCase()); } } } } uri = uri.replaceAll("http://dbpedia.org/resource/", ""); String[] splitter = uri.split("_"); if (splitter.length == 2) { String newuri = "http://dbpedia.org/resource/" + splitter[0]; if (entities.contains(newuri)) { System.out.println("SPORTSTEAM: " + splitter[0].toLowerCase() + " " + uri); newStringSet.add(splitter[0].toLowerCase()); } } else if (splitter.length > 2) { String newuri = "http://dbpedia.org/resource/" + splitter[0]; if (entities.contains(newuri)) { System.out.println("SPORTSTEAM: " + splitter[0].toLowerCase() + " " + uri); newStringSet.add(splitter[0].toLowerCase()); } newuri = "http://dbpedia.org/resource/" + splitter[0] + "_" + splitter[1]; if (entities.contains(newuri)) { String s = splitter[0] + " " + splitter[1]; newStringSet.add(s.toLowerCase()); System.out.println("SPORTSTEAM: " + s.toLowerCase() + " " + uri); } } return newStringSet; } private String filterStandardDomain(Set<String> set) { String res = "Misc"; for (String s : set) { if (s.equalsIgnoreCase("http://dbpedia.org/ontology/Person")) { res = "Person"; break; } else if (s.equalsIgnoreCase("http://dbpedia.org/ontology/Organisation")) { res = "Organisation"; break; } else if (s.equalsIgnoreCase("http://www.ontologydesignpatterns.org/ont/d0.owl#Location")) { res = "Location"; break; } } return res; } public Set<String> getRDFTypesFromEntity(final String entityUri) { Set<String> set = new HashSet<String>(); final String query = "SELECT ?types WHERE{ <" + entityUri + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?types. }"; ResultSet results = null; QueryExecution qexec = null; try { final com.hp.hpl.jena.query.Query cquery = QueryFactory.create(query); qexec = QueryExecutionFactory.create(cquery, instancemappingtypes); results = qexec.execSelect(); } catch (final QueryException e) { Logger.getRootLogger().error(e.getStackTrace()); } finally { if (results != null) { while (results.hasNext()) { final QuerySolution sol = results.nextSolution(); final String type = sol.getResource("types").toString(); set.add(type); } } } return set; } public void addSomeAbbreviations() { for (Map.Entry<String, HashSet<String>> entry : this.UNIQUELABELSTRINGS.entrySet()) { String url = entry.getKey(); HashSet<String> occs = entry.getValue(); String type = filterStandardDomain(getRDFTypesFromEntity(url)); if (type.equals("Location")) { String tempuri = url.replaceAll("http://dbpedia.org/resource/", "").toLowerCase(); tempuri = tempuri.replaceAll("_", " "); StringBuilder builder = new StringBuilder(); String splitter[] = tempuri.split(" "); if (splitter.length > 1) { for (int i = 0; i < splitter.length; i++) { builder.append(splitter[i].substring(0, 1)); builder.append("."); } occs.add(builder.toString()); } } } } public void addAdditionalSurfaceForms() { // Hack for (String s : entities) { if (!urlentitymapping.containsKey(s.toLowerCase())) { urlentitymapping.put(s.toLowerCase(), s); } } File folder = new File(SURFACEFORMDIRECTORY); File[] files = folder.listFiles(); for (int i = 0; i < files.length; i++) { File f = files[i]; try { XMLReader xmlReader = XMLReaderFactory.createXMLReader(); FileReader reader = new FileReader(f); InputSource inputSource = new InputSource(reader); Handler handler = new Handler(); xmlReader.setContentHandler(handler); xmlReader.parse(inputSource); } catch (SAXException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } File dirn3 = new File(SURFACEFORMDIRECTORYN3); File[] n3files = dirn3.listFiles(); for (int i = 0; i < n3files.length; i++) { File f = n3files[i]; try { BufferedReader reader = new BufferedReader(new FileReader(f)); String line = null; String sf = null; String entity = null; while ((line = reader.readLine()) != null) { line = line.trim(); if (line.startsWith("nif:anchorOf")) { String[] splitter = line.split("\""); if (splitter.length > 1) { sf = splitter[1].toLowerCase(); } } if (line.startsWith("itsrdf:taIdentRef")) { String[] splitter = line.split("<"); if (splitter.length > 1) { entity = splitter[1].split(">")[0]; } } if (sf != null && entity != null) { System.out.println("SF: " + sf + " Entity: " + entity); if (UNIQUELABELSTRINGS.containsKey(entity)) { Set<String> strings = UNIQUELABELSTRINGS.get(entity); strings.add(sf); } sf = null; entity = null; } } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } // addCustomSurfaceForm("http://dbpedia.org/resource/Annual_Meetings_of_the_International_Monetary_Fund_and_the_World_Bank_Group", // "annual meetings of the international monetary fund"); addCustomSurfaceForm("http://dbpedia.org/resource/Port_of_Turku", "turku's harbor"); addCustomSurfaceForm("http://dbpedia.org/resource/Rear-end_collision", "rear-ended"); addCustomSurfaceForm("http://dbpedia.org/resource/Song", "ngs b"); addCustomSurfaceForm("http://dbpedia.org/resource/Finnish_sauna", "finnish bathhouses"); addCustomSurfaceForm("http://dbpedia.org/resource/Autonomous_car", "vehicles that can drive themselves"); addCustomSurfaceForm("http://dbpedia.org/resource/Free_association_(psychology)", "free-associative"); addCustomSurfaceForm("http://dbpedia.org/resource/Leaf_shape", "leaf-shaped"); addCustomSurfaceForm("http://dbpedia.org/resource/CSKA_Moscow_Stadium", "arena of CSKA Moscow"); addCustomSurfaceForm("http://dbpedia.org/resource/Capital_of_Germany", "german capital's"); addCustomSurfaceForm("http://dbpedia.org/resource/MSN", "msn network"); addCustomSurfaceForm("http://dbpedia.org/resource/Sprint_Corporation", "sprint communications co"); addCustomSurfaceForm("http://dbpedia.org/resource/Abdelbaset_al-Megrahi", "abdulbasit al-maqrahi"); } private void addCustomSurfaceForm(String url, String sf) { if (UNIQUELABELSTRINGS.containsKey(url)) { Set<String> s = UNIQUELABELSTRINGS.get(url); s.add(sf); } } class Handler implements ContentHandler { private String currentValue; private String surfaceForm; private String entityUrl; Handler() { super(); surfaceForm = new String(""); entityUrl = new String(""); } @Override public void characters(char[] arg0, int arg1, int arg2) throws SAXException { currentValue += new String(arg0, arg1, arg2); } @Override public void endDocument() throws SAXException { // TODO Auto-generated method stub } @Override public void endElement(String arg0, String arg1, String arg2) throws SAXException { if (arg1.equals("SurfaceForm")) { this.surfaceForm = currentValue; } if (arg1.equals("ChosenAnnotation")) { this.entityUrl = currentValue; if (!surfaceForm.equals("") && !entityUrl.equals("")) { entityUrl = entityUrl.trim(); entityUrl = entityUrl.replaceAll("http://en.wikipedia.org/wiki/", ""); surfaceForm = surfaceForm.trim(); entityUrl = WikiPediaUriConverter.createConformDBpediaURI(entityUrl); entityUrl = entityUrl.toLowerCase(); if (urlentitymapping.containsKey(entityUrl)) { HashSet<String> set = UNIQUELABELSTRINGS.get(urlentitymapping.get(entityUrl)); // System.out.println("SurfaceForm: " + // surfaceForm.toLowerCase().replaceAll("_", " ") + " // URL " // + urlentitymapping.get(entityUrl)); if (set != null) { set.add(surfaceForm.toLowerCase().replaceAll("_", " ")); } } } } } @Override public void endPrefixMapping(String arg0) throws SAXException { // TODO Auto-generated method stub } @Override public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException { // TODO Auto-generated method stub } @Override public void processingInstruction(String arg0, String arg1) throws SAXException { // TODO Auto-generated method stub } @Override public void setDocumentLocator(Locator arg0) { // TODO Auto-generated method stub } @Override public void skippedEntity(String arg0) throws SAXException { // TODO Auto-generated method stub } @Override public void startDocument() throws SAXException { // TODO Auto-generated method stub } @Override public void startElement(String arg0, String arg1, String arg2, Attributes arg3) throws SAXException { if (arg2.equals("SurfaceForm")) { surfaceForm = ""; entityUrl = ""; } if (arg2.equals("SurfaceForm")) { this.currentValue = ""; } if (arg2.equals("ChosenAnnotation")) { this.currentValue = ""; } } @Override public void startPrefixMapping(String arg0, String arg1) throws SAXException { // TODO Auto-generated method stub } } public static void main(String[] args) { CreateDBpediaIndexV2 index = new CreateDBpediaIndexV2(); // List<String> l = new LinkedList<String>(); // l.add("France Agence-Press"); // System.out.println(index.createDBpediaOccs(l)); System.out.println("Step-1: Load Evidences"); // index.loadEvidences(); System.out.println("Step0: Create DBpediaPriors"); index.createDBpediaPriors(); System.out.println("Step1: Read Sportsteams"); index.sportsTeamsSurfaceForms(); System.out.println("Step2: Read Wikipedia Disambiguation Links"); index.readWikiPageDisambiguation(); System.out.println("Step3: Read Entity List"); index.readEntities(); System.out.println("Step4: DBPediaFacts"); index.fillRelationsIndex(); System.out.println("Step5: DBPediaProperties"); index.fillPropertiesIndex(); System.out.println("Step6: PattyFacts"); index.fillPattyRelationIndex(PATTYWIKIPATTERN, PATTYWIKIINSTANCE); System.out.println("Step7: PattyFreebaseFacts"); index.fillPattyFreebaseRelationIndex(PATTYFREEBASEPATTERN, PATTYFREEBASEINSTANCE); System.out.println("Step8: WorkLinkText"); index.workLinkText(); System.out.println("Step9: ReadOldIndex"); index.getUniqueLabelsFromOldIndex(); System.out.println("Step10: WorkEntities"); index.workEntities(); System.out.println("Step11: WorkRedirects"); index.workRedirects(); System.out.println("Step12: WebOccurrences"); index.insertWebOccurrences(); System.out.println("Step13: CreateSomeAbbreviations"); index.addSomeAbbreviations(); System.out.println("Step15: AddSomeSurfaceForms"); index.addAdditionalSurfaceForms(); System.out.println("Step16: CreateIndex"); index.createNewIndex(); // CreateDBpediaIndexV2 index = new CreateDBpediaIndexV2(); // index.addAdditionalSurfaceForms(); } }