package doc2vec.corpuscreation;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import com.google.gson.Gson;

import doc2vec.corpuscreation.ExtractContextOfWikipediaPages.Handler.Entry;
import doc2vec.corpuscreation.ExtractContextOfWikipediaPages.Handler.Entry.Entity;
import doser.tools.indexcreation.WikiPediaUriConverter;

/**
 * Extracts the textual context around hyperlinks ({@code <a href=...>}) found
 * in a directory of XML/XHTML page files and appends one JSON object per link
 * to an output file.
 * <p>
 * Only links whose converted target is contained in the set of "relevant
 * entities" are written. That set is filled from the {@code Mainlink} field of
 * every document in the Lucene index located at {@link #INDEX}.
 */
public class ExtractContextOfWikipediaPages {

    /**
     * Number of characters taken to the left and to the right of a link
     * position before the window is snapped to word boundaries.
     */
    public static int CHARSTOMOVE = 200;

    /** File system location of the Lucene index holding the relevant entities. */
    public static final String INDEX = "/home/zwicklbauer/NewIndexTryout";

    /** Values of the {@code Mainlink} field of all index documents. */
    private final HashSet<String> relevantEntities;

    /**
     * Command line entry point.
     *
     * @param args {@code args[0]} is the directory with the page files,
     *             {@code args[1]} the output file (opened in append mode)
     */
    public static void main(String[] args) {
        // Validate the arguments before the index is loaded.
        if (args == null || args.length < 2) {
            System.err.println("Usage: ExtractContextOfWikipediaPages <inputDirectory> <outputFile>");
            return;
        }
        ExtractContextOfWikipediaPages p = new ExtractContextOfWikipediaPages();
        p.extractRelevantEntities();
        try {
            p.doAction(args[0], args[1]);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Creates an extractor with an empty set of relevant entities. */
    public ExtractContextOfWikipediaPages() {
        super();
        this.relevantEntities = new HashSet<String>();
    }

    /**
     * Reads the {@code Mainlink} field of every document of the index at
     * {@link #INDEX} into the set of relevant entities. I/O failures are
     * printed and leave the set with whatever was read up to that point.
     */
    public void extractRelevantEntities() {
        Directory oldDir = null;
        IndexReader readerOldIndex = null;
        try {
            oldDir = FSDirectory.open(new File(INDEX));
            readerOldIndex = DirectoryReader.open(oldDir);
            final int maxDoc = readerOldIndex.maxDoc();
            for (int j = 0; j < maxDoc; ++j) {
                Document oldDoc = readerOldIndex.document(j);
                String ent = oldDoc.get("Mainlink");
                // Documents without the field would otherwise add null.
                if (ent != null) {
                    this.relevantEntities.add(ent);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close each resource exactly once, reader before directory.
            closeAndReport(readerOldIndex);
            closeAndReport(oldDir);
        }
    }

    /**
     * Parses every regular file directly inside {@code maindir} and appends
     * the contexts of all relevant links to {@code outputfile}. Files that
     * cannot be read or parsed are reported on stderr and skipped.
     *
     * @param maindir    directory containing the page files
     * @param outputfile file the JSON lines are appended to
     * @throws FileNotFoundException if {@code maindir} cannot be listed as a
     *                               directory or the output file cannot be
     *                               opened
     * @throws IOException           if the output file cannot be opened
     */
    public void doAction(String maindir, String outputfile)
            throws FileNotFoundException, IOException {
        File inputDir = new File(maindir);
        String[] files = inputDir.list();
        if (files == null) {
            // File.list() returns null for non-directories and on I/O errors.
            throw new FileNotFoundException("Cannot list directory: " + maindir);
        }

        Gson gson = new Gson();
        PrintWriter pWriter = new PrintWriter(new FileWriter(new File(outputfile), true));
        try {
            for (int i = 0; i < files.length; i++) {
                // Resolve against the directory so a missing trailing
                // separator in maindir does not produce a wrong path.
                File cFile = new File(inputDir, files[i]);
                if (!cFile.isFile()) {
                    continue;
                }
                String content = readWithoutLineBreaks(cFile);
                if (content == null) {
                    continue; // already reported by readWithoutLineBreaks
                }
                try {
                    XMLReader xmlReader = XMLReaderFactory.createXMLReader();
                    Handler handler = new Handler();
                    xmlReader.setContentHandler(handler);
                    xmlReader.parse(new InputSource(new StringReader(content)));
                    printObject(pWriter, handler.getEntry(), gson);
                    pWriter.flush();
                } catch (SAXException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            pWriter.close();
        }
    }

    /**
     * Reads a file into one string, concatenating its lines without any
     * separator (line terminators are dropped). The file is decoded with the
     * platform default charset, as {@link FileReader} does.
     *
     * @param file file to read
     * @return the file content, or {@code null} if reading failed (the error
     *         is printed)
     */
    private static String readWithoutLineBreaks(File file) {
        StringBuilder content = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
            return content.toString();
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
            return null;
        } finally {
            closeAndReport(reader);
        }
    }

    /**
     * Closes a resource if it is non-null and prints any failure.
     *
     * @param resource resource to close, may be {@code null}
     */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes one JSON line for every entity of {@code entry} whose link is a
     * relevant entity. The emitted content is the left context, the mention
     * and the right context, with runs of spaces collapsed to a single space.
     *
     * @param writer destination of the JSON lines
     * @param entry  parsed page (text plus link occurrences)
     * @param gson   serializer used for {@link Output}
     */
    private void printObject(PrintWriter writer, Entry entry, Gson gson) {
        final String text = entry.getText();
        if (text == null) {
            return;
        }
        for (Entity e : entry.getEntitySet()) {
            final String mention = e.getMention();
            final int position = e.getPosition();
            // Skip only the inconsistent entity instead of aborting the page.
            if (mention == null || position < 0 || position > text.length()) {
                continue;
            }
            if (!relevantEntities.contains(e.getLink())) {
                continue;
            }

            int start = getCorrectLeftWordBound(text, position - CHARSTOMOVE);
            start = Math.min(start, position);

            // The right context starts after the mention; make sure the
            // window never ends before that point, even for long mentions.
            final int mentionEnd = Math.min(position + mention.length(), text.length());
            final int end = getCorrectRightWordBound(text,
                    Math.max(position + CHARSTOMOVE, mentionEnd));

            String content = text.substring(start, position) + " " + mention
                    + " " + text.substring(mentionEnd, end);
            // Strings are immutable: the result has to be assigned.
            content = content.replaceAll(" +", " ");

            Output out = new Output();
            out.setContent(content);
            out.setUrl(e.getLink());
            out.setMention(mention);
            writer.println(gson.toJson(out, Output.class));
        }
    }

    /**
     * Moves {@code pos} to the left until a space is found and returns the
     * index right after that space, or {@code 0} if the beginning of the text
     * is reached first.
     *
     * @param text text to search in
     * @param pos  start index; values outside the text are clamped
     * @return index of the first character of the word window
     */
    private static int getCorrectLeftWordBound(String text, int pos) {
        int idx = Math.max(Math.min(pos, text.length() - 1), 0);
        while (idx > 0) {
            if (text.charAt(idx) == ' ') {
                return idx + 1;
            }
            idx--;
        }
        return idx;
    }

    /**
     * Moves {@code pos} to the right until a space is found and returns the
     * index of that space, or the text length if the end is reached first.
     *
     * @param text text to search in
     * @param pos  start index; values outside the text are clamped
     * @return exclusive end index of the word window
     */
    private static int getCorrectRightWordBound(String text, int pos) {
        int idx = Math.max(Math.min(pos, text.length()), 0);
        while (idx < text.length() && text.charAt(idx) != ' ') {
            idx++;
        }
        return idx;
    }

    /**
     * JSON output record. Field names are the JSON keys produced by Gson and
     * therefore must not be renamed.
     */
    public class Output {

        private String entity;
        private String content;
        private String mention;

        public String getEntity() {
            return entity;
        }

        /** Stores the given URL in the {@code entity} field. */
        public void setUrl(String url) {
            this.entity = url;
        }

        public String getContent() {
            return content;
        }

        public void setContent(String content) {
            this.content = content;
        }

        public String getMention() {
            return mention;
        }

        public void setMention(String mention) {
            this.mention = mention;
        }
    }

    /**
     * SAX handler that accumulates all character data of a document and
     * records, for every {@code <a>} element carrying an {@code href}, the
     * link target, the anchor text and the offset of the anchor text inside
     * the accumulated document text.
     */
    class Handler implements ContentHandler {

        private Entry entry;
        private StringBuilder documentText;
        /** {@code href} of the currently open anchor, {@code null} if none. */
        private String link;
        /** Offset in {@link #documentText} where the open anchor started. */
        private int position;

        public Handler() {
            super();
            this.entry = new Entry();
            this.documentText = new StringBuilder();
            this.link = null;
            this.position = 0;
        }

        public Entry getEntry() {
            return this.entry;
        }

        @Override
        public void characters(char[] arg0, int arg1, int arg2)
                throws SAXException {
            this.documentText.append(arg0, arg1, arg2);
        }

        @Override
        public void endDocument() throws SAXException {
            this.entry.setText(documentText.toString());
        }

        @Override
        public void endElement(String arg0, String arg1, String arg2)
                throws SAXException {
            if (arg1.equals("a")) {
                // Anchors without href (or an end tag without a recorded
                // start) carry no link target and are ignored.
                if (this.link != null) {
                    // The parser may deliver text in several chunks, so the
                    // mention is everything appended since the start tag.
                    String mention = documentText.substring(this.position);
                    entry.addEntity(mention, this.link, this.position);
                }
                this.link = null;
            }
        }

        @Override
        public void endPrefixMapping(String arg0) throws SAXException {
            // not needed
        }

        @Override
        public void ignorableWhitespace(char[] arg0, int arg1, int arg2)
                throws SAXException {
            // not needed
        }

        @Override
        public void processingInstruction(String arg0, String arg1)
                throws SAXException {
            // not needed
        }

        @Override
        public void setDocumentLocator(Locator arg0) {
            // not needed
        }

        @Override
        public void skippedEntity(String arg0) throws SAXException {
            // not needed
        }

        @Override
        public void startDocument() throws SAXException {
            // not needed
        }

        @Override
        public void startElement(String arg0, String arg1, String arg2,
                Attributes arg3) throws SAXException {
            if (arg1.equals("a")) {
                this.link = arg3.getValue("href");
                this.position = documentText.length();
            }
        }

        @Override
        public void startPrefixMapping(String arg0, String arg1)
                throws SAXException {
            // not needed
        }

        /** Parsed page: the full text and all link occurrences in it. */
        class Entry {

            /** One link occurrence inside the page text. */
            class Entity {

                private int position;
                private String link;
                private String mention;

                public int getPosition() {
                    return position;
                }

                public void setPosition(int position) {
                    this.position = position;
                }

                public String getLink() {
                    return link;
                }

                /**
                 * Removes every literal {@code ".html"} from the given link
                 * and stores the result of passing it through
                 * {@code WikiPediaUriConverter.createConformDBpediaUrifromEncodedString}.
                 * A {@code null} link is stored as {@code null}.
                 *
                 * @param link raw {@code href} value, may be {@code null}
                 */
                public void setLink(String link) {
                    if (link == null) {
                        this.link = null;
                        return;
                    }
                    // Literal replacement: with replaceAll the '.' would be a
                    // regex wildcard and also strip e.g. "Xhtml".
                    String withoutending = link.replace(".html", "");
                    this.link = WikiPediaUriConverter
                            .createConformDBpediaUrifromEncodedString(withoutending);
                }

                public String getMention() {
                    return mention;
                }

                public void setMention(String mention) {
                    this.mention = mention;
                }
            }

            private List<Entity> entitySet;
            private String text;

            public Entry() {
                super();
                this.entitySet = new LinkedList<Entity>();
            }

            public List<Entity> getEntitySet() {
                return entitySet;
            }

            public void setEntitySet(List<Entity> entitySet) {
                this.entitySet = entitySet;
            }

            public String getText() {
                return text;
            }

            public void setText(String text) {
                this.text = text;
            }

            /**
             * Appends a new link occurrence.
             *
             * @param mention  anchor text
             * @param link     raw {@code href} value
             * @param position offset of the anchor text in the page text
             */
            public void addEntity(String mention, String link, int position) {
                Entity e = new Entity();
                e.setLink(link);
                e.setMention(mention);
                e.setPosition(position);
                this.entitySet.add(e);
            }
        }
    }
}