package doser.tools.indexcreation; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.StringReader; import java.util.LinkedList; import java.util.List; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.Version; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; import doser.tools.indexcreation.CreateWikipediaDocumentCentricKB.Handler.Entry; import doser.tools.indexcreation.CreateWikipediaDocumentCentricKB.Handler.Entry.Entity; /** * Creates a document-centric knowledge base out of the extracted Wikipedia * pages (see S1HtmlToPlainTextWithEntities). * * @author quh * */ public class CreateWikipediaDocumentCentricKB { public CreateWikipediaDocumentCentricKB() { super(); } public void create(String documentDirectory, String luceneOutput) { StandardAnalyzer analyzer = new StandardAnalyzer(); try { MMapDirectory dir = new MMapDirectory(new File(luceneOutput)); IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer); IndexWriter writer = new IndexWriter(dir, config); File wikipediaFiles = new File(documentDirectory); File[] files = wikipediaFiles.listFiles(); for (int i = 0; i < files.length; i++) { String content = ""; try { BufferedReader reader = new BufferedReader(new FileReader( files[i])); String line = null; while ((line = reader.readLine()) != null) { content += line; } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } Entry entry = null; try { XMLReader xmlReader = XMLReaderFactory.createXMLReader(); Handler handler = new Handler(); InputSource inputSource = new InputSource(new StringReader( content)); xmlReader.setContentHandler(handler); xmlReader.parse(inputSource); entry = handler.getEntry(); } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } if (entry != null) { Document doc = new Document(); doc.add(new StringField("Title", files[i].getName(), Field.Store.YES)); doc.add(new TextField("Text", entry.text, Field.Store.YES)); StringBuffer buffer = generateAnnotatedEntityStrings(entry .getEntitySet()); doc.add(new StringField("Entities", buffer.toString(), Field.Store.YES)); writer.addDocument(doc); } } writer.close(); } catch (IOException e) { e.printStackTrace(); } } private StringBuffer generateAnnotatedEntityStrings(List<Entity> entities) { StringBuffer buffer = new StringBuffer(); for (Entity entity : entities) { buffer.append(entity.getLink()); } return buffer; } class Handler implements ContentHandler { private String currentValue; private Entry entry; private StringBuffer documentText; private String link; private String mention; private int position; public Handler() { super(); this.currentValue = ""; this.entry = new Entry(); this.documentText = new StringBuffer(); this.link = null; this.mention = null; this.position = 0; } public Entry getEntry() { return this.entry; } @Override public void characters(char[] arg0, int arg1, int arg2) throws SAXException { this.currentValue = new String(arg0, arg1, arg2); this.documentText.append(new String(arg0, arg1, arg2)); } @Override public void endDocument() throws SAXException { this.entry.setText(documentText.toString()); } @Override public void endElement(String arg0, String arg1, String arg2) throws SAXException { if (arg1.equals("a")) { this.mention = currentValue; entry.addEntity(this.mention, this.link, this.position); this.mention = null; this.link = null; } } @Override public void endPrefixMapping(String arg0) throws SAXException { // TODO Auto-generated method stub } @Override public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException { // TODO Auto-generated method stub } @Override public void processingInstruction(String arg0, String arg1) throws SAXException { // TODO Auto-generated method stub } @Override public void setDocumentLocator(Locator arg0) { // TODO Auto-generated method stub } @Override public void skippedEntity(String arg0) throws SAXException { // TODO Auto-generated method stub } @Override public void startDocument() throws SAXException { // TODO Auto-generated method stub } @Override public void startElement(String arg0, String arg1, String arg2, Attributes arg3) throws SAXException { if (arg1.equals("a")) { this.link = arg3.getValue("href"); this.position = documentText.length(); } } @Override public void startPrefixMapping(String arg0, String arg1) throws SAXException { // TODO Auto-generated method stub } class Entry { class Entity { private int position; private String link; private String mention; public int getPosition() { return position; } public void setPosition(int position) { this.position = position; } public String getLink() { return link; } /* * Bugfix! Links that were redirections in Step 1 miss ".html". */ public void setLink(String link) { if (!link.endsWith(".html")) { this.link = link + ".html"; } else { this.link = link; } } public String getMention() { return mention; } public void setMention(String mention) { this.mention = mention; } } private List<Entity> entitySet; private String text; public Entry() { super(); this.entitySet = new LinkedList<Entity>(); } public List<Entity> getEntitySet() { return entitySet; } public void setEntitySet(List<Entity> entitySet) { this.entitySet = entitySet; } public String getText() { return text; } public void setText(String text) { this.text = text; } public void addEntity(String mention, String link, int position) { Entity e = new Entity(); e.setLink(link); e.setMention(mention); e.setPosition(position); this.entitySet.add(e); } } } public static void main(String[] args) { CreateWikipediaDocumentCentricKB kb = new CreateWikipediaDocumentCentricKB(); kb.create(args[0], args[1]); } }