package doc2vec.corpuscreation; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.io.StringReader; import java.util.HashSet; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; //import doser.nlp.NLPTools; import doser.tools.indexcreation.WikiPediaUriConverter; public class CreateD2VCorpus_Wikipedia { // public static final String INDEX = // "/mnt/ssd1/disambiguation/LuceneIndex/Wikipedia_Default_AidaNew/"; public static final String WIKIPEDIAPAGESDIR = "/mnt/storage/zwicklbauer/WikiParse/ger_wiki/dump/plain"; private static String outputFilePath; // private HashSet<String> relevantEntities; public static void main(String[] args) { outputFilePath = args[0]; CreateD2VCorpus_Wikipedia creation = new CreateD2VCorpus_Wikipedia(); creation.action(); } public CreateD2VCorpus_Wikipedia() { super(); // this.relevantEntities = new HashSet<String>(); } public void action() { System.out.println("ExtractRelevantEntities"); // extractRelevantEntities(); System.out.println("CreateOutputFile"); createOutputFile(); } public void createOutputFile() { File outputFile = new File(outputFilePath); PrintWriter writer = null; BufferedReader reader = null; File file = new File(WIKIPEDIAPAGESDIR); File[] files = file.listFiles(); try { writer = new PrintWriter(outputFile); for (int i = 0; i < files.length; i++) { String name = files[i].getName(); String finalLink = WikiPediaUriConverter .createConformDBpediaUrifromEncodedString_German(name.replaceAll(".html", "").replaceAll("'", "%")); // if (relevantEntities.contains(finalLink)) { StringBuilder builder = new StringBuilder(); String content = ""; try { reader = new BufferedReader(new FileReader(files[i])); String line = null; while ((line = reader.readLine()) != null) { content += line; } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } try { XMLReader xmlReader = XMLReaderFactory.createXMLReader(); PlainTextHandler handler = new PlainTextHandler(); InputSource inputSource = new InputSource(new StringReader(content)); xmlReader.setContentHandler(handler); xmlReader.parse(inputSource); builder.append(handler.getDocumentText()); String wikitext = builder.toString(); // wikitext = NLPTools.getInstance() // .performLemmatizationAndStopWordRemoval( // wikitext); wikitext = wikitext.replaceAll("\\.", " "); wikitext = wikitext.replaceAll("\\,", " "); wikitext = wikitext.replaceAll("\\!", " "); wikitext = wikitext.replaceAll("\\?", " "); wikitext = wikitext.replaceAll(" +", " "); if (!wikitext.equalsIgnoreCase("") && !finalLink.equalsIgnoreCase("") && !finalLink.equalsIgnoreCase(" ") && !finalLink.equalsIgnoreCase("http://dbpedia.org/resource/")) { writer.println(finalLink + " " + wikitext); } } catch (SAXException e) { e.printStackTrace(); } } // } } catch (IOException e) { e.printStackTrace(); } finally { if (writer != null) { writer.close(); } } } // public void extractRelevantEntities() { // File oldIndexFile = new File(INDEX); // IndexReader readerOldIndex = null; // try { // final Directory oldDir = FSDirectory.open(oldIndexFile); // readerOldIndex = DirectoryReader.open(oldDir); // for (int j = 0; j < readerOldIndex.maxDoc(); ++j) { // Document oldDoc = readerOldIndex.document(j); // String ent = oldDoc.get("Mainlink"); // this.relevantEntities.add(ent); // } // readerOldIndex.close(); // } catch (IOException e) { // e.printStackTrace(); // } finally { // if (readerOldIndex != null) { // try { // readerOldIndex.close(); // } catch (IOException e) { // e.printStackTrace(); // } // } // } // } class PlainTextHandler implements ContentHandler { private StringBuffer documentText; public PlainTextHandler() { super(); this.documentText = new StringBuffer(); } @Override public void characters(char[] arg0, int arg1, int arg2) throws SAXException { this.documentText.append(new String(arg0, arg1, arg2)); } @Override public void endDocument() throws SAXException { } @Override public void endElement(String arg0, String arg1, String arg2) throws SAXException { } @Override public void endPrefixMapping(String arg0) throws SAXException { } @Override public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException { } @Override public void processingInstruction(String arg0, String arg1) throws SAXException { } @Override public void setDocumentLocator(Locator arg0) { } @Override public void skippedEntity(String arg0) throws SAXException { } @Override public void startDocument() throws SAXException { } @Override public void startElement(String arg0, String arg1, String arg2, Attributes arg3) throws SAXException { } @Override public void startPrefixMapping(String arg0, String arg1) throws SAXException { } public String getDocumentText() { return documentText.toString(); } } }