/* LanguageTool, a natural language style checker * Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev.dumpcheck; import org.languagetool.Language; import org.languagetool.dev.wikipedia.SwebleWikipediaTextFilter; import org.languagetool.tokenizers.Tokenizer; import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.XMLEvent; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.NoSuchElementException; import java.util.regex.Pattern; /** * Provides access to the sentences of a Wikipedia XML dump. Note that * conversion exceptions are logged to STDERR and are otherwise ignored. * * To get an XML dump, download {@code pages-articles.xml.bz2} from * <a href="http://download.wikimedia.org/backup-index.html">http://download.wikimedia.org/backup-index.html</a>, e.g. * {@code http://download.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2}. 
* @since 2.4
 */
public class WikipediaSentenceSource extends SentenceSource {

  /** When {@code true}, only pages in {@link #ARTICLE_NAMESPACE} are processed. */
  private static final boolean ONLY_ARTICLES = false;
  private static final String ARTICLE_NAMESPACE = "0";
  /** Prefix (compared case-insensitively) that marks a page's text as a redirect. */
  private static final String REDIRECT_PREFIX = "#redirect";

  private final SwebleWikipediaTextFilter textFilter = new SwebleWikipediaTextFilter();
  private final XMLEventReader reader;
  private final Tokenizer sentenceTokenizer;
  /** Sentences already extracted from the dump but not yet handed out by {@link #next()}. */
  private final List<WikipediaSentence> sentences;
  private final Language language;

  /** Number of {@code <title>} elements seen so far. */
  private int articleCount = 0;
  // The two skip counters are only incremented; they are read by the (commented-out)
  // debug output in handleTextElement().
  private int namespaceSkipCount = 0;
  private int redirectSkipCount = 0;
  // Content of the most recently read <title> and <ns> elements. Kept as fields so the
  // values survive between fillSentences() calls, just like articleCount does.
  private String currentTitle;
  private String currentNamespace;

  /**
   * Creates a source that reads the given XML dump without a sentence filter.
   *
   * @param xmlInput stream providing the XML dump
   * @param language language used to obtain the sentence tokenizer and to build article URLs
   */
  public WikipediaSentenceSource(InputStream xmlInput, Language language) {
    this(xmlInput, language, null);
  }

  /**
   * Creates a source that reads the given XML dump.
   *
   * @param xmlInput stream providing the XML dump
   * @param language language used to obtain the sentence tokenizer and to build article URLs
   * @param filter passed on unchanged to the {@code SentenceSource} constructor
   * @throws RuntimeException if the XML reader cannot be created
   * @since 3.0
   */
  public WikipediaSentenceSource(InputStream xmlInput, Language language, Pattern filter) {
    super(language, filter);
    textFilter.enableMapping(false);  // improves performance
    try {
      // NOTE: this is a JVM-wide setting, not one local to this reader.
      System.setProperty("jdk.xml.totalEntitySizeLimit", String.valueOf(Integer.MAX_VALUE));  // see https://github.com/dbpedia/extraction-framework/issues/487
      XMLInputFactory factory = XMLInputFactory.newInstance();
      reader = factory.createXMLEventReader(xmlInput);
      sentenceTokenizer = language.getSentenceTokenizer();
      sentences = new ArrayList<>();
      this.language = language;
    } catch (XMLStreamException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Reads ahead in the dump until at least one sentence is buffered or the input is exhausted.
   *
   * @return {@code true} if another sentence is available
   * @throws RuntimeException if the XML cannot be parsed
   */
  @Override
  public boolean hasNext() {
    try {
      fillSentences();
    } catch (XMLStreamException e) {
      throw new RuntimeException(e);
    }
    return !sentences.isEmpty();
  }

  /**
   * Returns the next buffered sentence together with its article title, article URL and
   * article number.
   *
   * @throws NoSuchElementException if the dump contains no further sentences
   * @throws RuntimeException if the XML cannot be parsed
   */
  @Override
  public Sentence next() {
    try {
      fillSentences();
      if (sentences.isEmpty()) {
        throw new NoSuchElementException();
      }
      WikipediaSentence wikiSentence = sentences.remove(0);
      String url = "http://" + language.getShortCode() + ".wikipedia.org/wiki/" + wikiSentence.title;
      return new Sentence(wikiSentence.sentence, getSource(), wikiSentence.title, url, wikiSentence.articleCount);
    } catch (XMLStreamException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public String getSource() {
    return "wikipedia";
  }

  /**
   * Consumes XML events until the sentence buffer is non-empty or the input ends.
   * Does nothing if sentences are already buffered.
   */
  private void fillSentences() throws XMLStreamException {
    while (sentences.isEmpty() && reader.hasNext()) {
      XMLEvent event = reader.nextEvent();
      if (event.getEventType() != XMLStreamConstants.START_ELEMENT) {
        continue;
      }
      String elementName = event.asStartElement().getName().getLocalPart();
      switch (elementName) {
        case "title":
          currentTitle = readCharacterData();
          articleCount++;
          break;
        case "ns":
          currentNamespace = readCharacterData();
          break;
        case "text":
          handleTextElement(currentNamespace, currentTitle, articleCount);
          break;
        default:
          // all other elements are irrelevant here
          break;
      }
    }
  }

  /**
   * Concatenates all character events that directly follow the current position.
   * The reader is created without requesting coalescing, so the text of a single element
   * may be reported as several consecutive character events (for example around an entity
   * reference such as {@code &amp;}); reading only the first one would truncate the value.
   * The first non-character event is left in the reader.
   *
   * @return the collected text, or an empty string if no character event follows
   */
  private String readCharacterData() throws XMLStreamException {
    StringBuilder sb = new StringBuilder();
    while (reader.hasNext() && reader.peek().isCharacters()) {
      sb.append(reader.nextEvent().asCharacters().getData());
    }
    return sb.toString();
  }

  /**
   * Reads the content of a {@code <text>} element, converts it to plain text, splits it into
   * sentences and buffers every sentence accepted by {@code acceptSentence}. Redirect pages
   * are skipped. Any exception during conversion is printed to STDERR and the document is
   * skipped.
   *
   * @param namespace content of the page's {@code <ns>} element, may be {@code null}
   * @param title content of the page's {@code <title>} element, may be {@code null}
   * @param articleCount running number of the article the text belongs to
   */
  private void handleTextElement(String namespace, String title, int articleCount) throws XMLStreamException {
    if (ONLY_ARTICLES && !ARTICLE_NAMESPACE.equals(namespace)) {
      namespaceSkipCount++;
      return;
    }
    //System.out.println(articleCount + " (nsSkip:" + namespaceSkipCount + ", redirectSkip:" + redirectSkipCount + "). " + title);
    String wikiText = readCharacterData();
    try {
      if (isRedirect(wikiText)) {
        redirectSkipCount++;
        return;
      }
      String textToCheck = textFilter.filter(wikiText).getPlainText();
      for (String sentence : sentenceTokenizer.tokenize(textToCheck)) {
        if (acceptSentence(sentence)) {
          sentences.add(new WikipediaSentence(sentence, title, articleCount));
        }
      }
    } catch (Exception e) {
      System.err.println("Could not extract text, skipping document: " + e + ", full stacktrace follows:");
      e.printStackTrace();
    }
  }

  /**
   * Tells whether the trimmed wiki text starts with {@code #redirect}, ignoring case.
   * Uses a per-character comparison so the result does not depend on the JVM's default
   * locale and the (potentially large) text does not have to be lower-cased as a whole.
   */
  private static boolean isRedirect(String wikiText) {
    String trimmed = wikiText.trim();
    return trimmed.regionMatches(true, 0, REDIRECT_PREFIX, 0, REDIRECT_PREFIX.length());
  }

  /** A sentence together with the title and running number of the article it came from. */
  private static class WikipediaSentence {
    final String sentence;
    final String title;
    final int articleCount;

    WikipediaSentence(String sentence, String title, int articleCount) {
      this.sentence = sentence;
      this.title = title;
      this.articleCount = articleCount;
    }
  }

}