package doser.tools.indexcreation;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import doser.lucene.analysis.DoserIDAnalyzer;

/**
 * Enriches the doser Lucene index with Patty facts.
 *
 * <p>
 * The tool reads a tab-separated pattern file (id, pattern) and a
 * tab-separated instance file (pattern id, subject, object), groups the
 * resulting facts by subject, and copies every document of {@link #OLDINDEX}
 * into {@link #NEWINDEX} with an additional {@link #FACTFIELD} field that
 * holds the facts of the document's entity joined by {@code ";;;"}.
 *
 * @author stefan
 */
public class AddPattyFactsToIndex {

    public static final String FACTFIELD = "PattyFreebaseTypesFacts";

    public static final String OLDINDEX = "/mnt/ssd1/disambiguation/MMapLuceneIndexStandard/";

    public static final String NEWINDEX = "/home/zwicklbauer/NewIndexTryout";

    /** URI prefix stripped from a document's "Mainlink" to obtain the fact-map key. */
    private static final String DBPEDIA_RESOURCE_PREFIX = "http://dbpedia.org/resource/";

    /** Separator placed between the individual facts stored in {@link #FACTFIELD}. */
    private static final String FACT_SEPARATOR = ";;;";

    /**
     * Entry point.
     *
     * @param args
     *            {@code args[0]} is the instance file, {@code args[1]} is the
     *            pattern file (either standard WikiTypes or Freebase)
     * @throws IllegalArgumentException
     *             if fewer than two arguments are given
     */
    public static void main(String[] args) {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: AddPattyFactsToIndex <instanceFile> <patternFile>");
        }

        Map<Integer, String> patternMap = new HashMap<Integer, String>();
        Map<String, List<String>> map = new HashMap<String, List<String>>();

        // Read errors are reported but do not abort the run: whatever was
        // loaded before the failure is still used, as in the original tool.
        try {
            readPatterns(new File(args[1]), patternMap);
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            readFacts(new File(args[0]), patternMap, map);
        } catch (IOException e) {
            e.printStackTrace();
        }

        int annotatedEntities = writeEnrichedIndex(map);

        System.out.println("Summary: ");
        System.out.println("HashSize: " + map.size());
        System.out.println("Annotated Entities: " + annotatedEntities);
    }

    /**
     * Loads the pattern file into {@code patterns} (pattern id to pattern
     * text). The first line of the file is discarded (presumably a header
     * row). Rows with fewer than two columns or a non-numeric id are reported
     * on stderr and skipped.
     *
     * <p>
     * NOTE(review): {@link FileReader} decodes with the platform default
     * charset; confirm that this matches the encoding of the input files.
     *
     * @param patternFile
     *            tab-separated file with the id in column 0 and the pattern in
     *            column 1
     * @param patterns
     *            map that receives the parsed entries
     * @throws IOException
     *             if the file cannot be opened or read
     */
    private static void readPatterns(File patternFile, Map<Integer, String> patterns)
            throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(patternFile))) {
            reader.readLine(); // discard the first line
            String line;
            while ((line = reader.readLine()) != null) {
                String[] columns = line.split("\\t");
                if (columns.length < 2) {
                    System.err.println("Skipping malformed pattern line: " + line);
                    continue;
                }
                try {
                    patterns.put(Integer.valueOf(columns[0]), columns[1]);
                } catch (NumberFormatException e) {
                    // Do not store the row under a null key.
                    System.err.println("Skipping pattern line with non-numeric id: " + line);
                }
            }
        }
    }

    /**
     * Loads the instance file and appends one fact per valid row to the list
     * stored under the row's subject in {@code factsBySubject}. The first line
     * of the file is discarded (presumably a header row). Spaces in subject
     * and object are replaced by underscores. Rows that have fewer than three
     * columns, a non-numeric pattern id, or a pattern id that is not contained
     * in {@code patterns} are reported on stderr and skipped.
     *
     * @param instanceFile
     *            tab-separated file with the pattern id in column 0, the
     *            subject in column 1 and the object in column 2
     * @param patterns
     *            pattern id to pattern text, as filled by
     *            {@link #readPatterns(File, Map)}
     * @param factsBySubject
     *            map that receives the facts, keyed by subject
     * @throws IOException
     *             if the file cannot be opened or read
     */
    private static void readFacts(File instanceFile, Map<Integer, String> patterns,
            Map<String, List<String>> factsBySubject) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(instanceFile))) {
            reader.readLine(); // discard the first line
            String line;
            while ((line = reader.readLine()) != null) {
                String[] columns = line.split("\\t");
                if (columns.length < 3) {
                    System.err.println("Skipping malformed instance line: " + line);
                    continue;
                }
                Integer patternId;
                try {
                    patternId = Integer.valueOf(columns[0]);
                } catch (NumberFormatException e) {
                    System.err.println("Skipping instance line with non-numeric id: " + line);
                    continue;
                }
                String pattern = patterns.get(patternId);
                if (pattern == null) {
                    // Avoids writing the literal text "null" as pattern name.
                    System.err.println("Skipping instance line with unknown pattern id: " + line);
                    continue;
                }
                String subject = columns[1].replace(' ', '_');
                String object = columns[2].replace(' ', '_');

                List<String> facts = factsBySubject.get(subject);
                if (facts == null) {
                    facts = new LinkedList<String>();
                    factsBySubject.put(subject, facts);
                }
                facts.add("pattyWikiTypes/" + pattern + ":::" + "dbpediaRes/" + object);
            }
        }
    }

    /**
     * Copies every document of {@link #OLDINDEX} into {@link #NEWINDEX},
     * adding a {@link #FACTFIELD} field to each. The field is always added; it
     * is empty when no facts are known for the document. An
     * {@link IOException} is reported on stderr and ends the copy early.
     * Reader, writer and both directories are closed in every case.
     *
     * <p>
     * NOTE(review): the loop runs up to {@code maxDoc()}, which according to
     * the Lucene documentation also counts deleted documents; confirm that the
     * source index contains no deletions.
     *
     * @param factsBySubject
     *            facts keyed by the entity name, i.e. the document's
     *            "Mainlink" value without the DBpedia resource prefix
     * @return number of documents that received at least one fact
     */
    private static int writeEnrichedIndex(Map<String, List<String>> factsBySubject) {
        int annotatedEntities = 0;
        try (Directory oldDir = FSDirectory.open(new File(OLDINDEX));
                Directory newDir = FSDirectory.open(new File(NEWINDEX));
                IndexReader readerOldIndex = DirectoryReader.open(oldDir);
                IndexWriter newIndexWriter = new IndexWriter(newDir,
                        new IndexWriterConfig(Version.LATEST, new DoserIDAnalyzer()))) {
            final int maxDoc = readerOldIndex.maxDoc();
            for (int i = 0; i < maxDoc; i++) {
                Document doc = readerOldIndex.document(i);
                String docurl = doc.get("Mainlink");
                // A document without "Mainlink" simply gets an empty fact field.
                List<String> facts = null;
                if (docurl != null) {
                    facts = factsBySubject.get(docurl.replace(DBPEDIA_RESOURCE_PREFIX, ""));
                }
                String joined = joinFacts(facts);
                if (!joined.isEmpty()) {
                    annotatedEntities++;
                }
                doc.add(new TextField(FACTFIELD, joined, Store.YES));
                newIndexWriter.addDocument(doc);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return annotatedEntities;
    }

    /**
     * Joins the given facts with {@link #FACT_SEPARATOR}.
     *
     * @param facts
     *            facts to join, may be {@code null}
     * @return the joined facts, or the empty string for {@code null} or an
     *         empty list
     */
    private static String joinFacts(List<String> facts) {
        if (facts == null) {
            return "";
        }
        StringBuilder builder = new StringBuilder();
        for (String fact : facts) {
            if (builder.length() > 0) {
                builder.append(FACT_SEPARATOR);
            }
            builder.append(fact);
        }
        return builder.toString();
    }
}