package word2vec.corpuscreation; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.io.StringReader; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; import doser.tools.indexcreation.WikiPediaUriConverter; public class CreateEntityCorpus { public static final boolean SEPERATELINES = true; public static final String CORPUSFILE = "/home/zwicklbauer/word2vec/corpus/wikientitycorpus.dat"; public static final String DIRECTORY = "/mnt/storage/zwicklbauer/WikiParse/temp/plain_reduced"; public CreateEntityCorpus() { super(); } public void createCorpus() throws FileNotFoundException { PrintWriter writer = new PrintWriter(new File(CORPUSFILE)); File directory = new File(DIRECTORY); File[] files = directory.listFiles(); for (int i = 0; i < files.length; i++) { File f = files[i]; String filecontent = ""; try { BufferedReader reader = new BufferedReader(new FileReader(f)); String line = null; while ((line = reader.readLine()) != null) { filecontent += line; } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } try { XMLReader xmlReader = XMLReaderFactory.createXMLReader(); Handler handler = new Handler(); InputSource inputSource = new InputSource(new StringReader( filecontent)); xmlReader.setContentHandler(handler); xmlReader.parse(inputSource); String fileText = handler.getString(); print(writer, fileText); if(SEPERATELINES) { printNewLine(writer); } } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } class Handler implements ContentHandler { private StringBuilder builder; Handler() { super(); builder = new StringBuilder(); } @Override public void setDocumentLocator(Locator locator) { // TODO Auto-generated method stub } @Override public void startDocument() throws SAXException { // TODO Auto-generated method stub } @Override public void endDocument() throws SAXException { // TODO Auto-generated method stub } @Override public void startPrefixMapping(String prefix, String uri) throws SAXException { // TODO Auto-generated method stub } @Override public void endPrefixMapping(String prefix) throws SAXException { // TODO Auto-generated method stub } @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { if (localName.equals("a")) { builder.append(WikiPediaUriConverter .createConformDBpediaUriEndingfromEncodedString(atts .getValue("href")) + " "); } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { // TODO Auto-generated method stub } @Override public void characters(char[] ch, int start, int length) throws SAXException { String val = new String(ch, start, length); if(val.contains(".")) { builder.append(". "); } else if(val.contains("?")) { builder.append("? "); } else if(val.contains("!")) { builder.append("! "); } } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { // TODO Auto-generated method stub } @Override public void processingInstruction(String target, String data) throws SAXException { // TODO Auto-generated method stub } @Override public void skippedEntity(String name) throws SAXException { // TODO Auto-generated method stub } public String getString() { return builder.toString(); } } public void print(PrintWriter writer, String text) { writer.print(text); writer.flush(); } public void printNewLine(PrintWriter writer) { writer.print(System.lineSeparator()); } public static void main(String[] args) { CreateEntityCorpus corpus = new CreateEntityCorpus(); try { corpus.createCorpus(); } catch (FileNotFoundException e) { e.printStackTrace(); } } }