/* LanguageTool, a natural language style checker * Copyright (C) 2010 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev.wikipedia; import java.io.IOException; import java.util.Date; import java.util.List; import org.languagetool.JLanguageTool; import org.languagetool.Language; import org.languagetool.TextFilter; import org.languagetool.rules.RuleMatch; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; /** * Read the Wikipedia XML dump, check texts with LanguageTool, and * let result be handled in sub classes. */ abstract class BaseWikipediaDumpHandler extends DefaultHandler { protected static final int CONTEXT_SIZE = 50; protected static final String MARKER_START = "<err>"; protected static final String MARKER_END = "</err>"; protected static final String LANG_MARKER = "XX"; protected static final String URL_PREFIX = "http://" + LANG_MARKER + ".wikipedia.org/wiki/"; protected Date dumpDate; protected String langCode; protected int maxErrors = 0; protected int errorCount = 0; private final JLanguageTool languageTool; private int ruleMatchCount = 0; private int maxArticles = 0; private int articleCount = 0; private boolean inText = false; private StringBuilder text = new StringBuilder(); private String title; private TextFilter textFilter = new SwebleWikipediaTextFilter(); //=========================================================== // SAX DocumentHandler methods //=========================================================== protected BaseWikipediaDumpHandler(JLanguageTool languageTool, Date dumpDate, String langCode, Language lang) { this.languageTool = languageTool; this.dumpDate = dumpDate; this.langCode = langCode; textFilter = TextFilterTools.getTextFilter(lang); } public void setMaximumArticles(int maxArticles) { this.maxArticles = maxArticles; } public void setMaximumErrors(int maxErrors) { this.maxErrors = maxErrors; } int getArticleCount() { return articleCount; } int getRuleMatchCount() { return ruleMatchCount; } @Override @SuppressWarnings("unused") public void startElement(String namespaceURI, String lName, String qName, Attributes attrs) throws SAXException { if (qName.equals("title")) { inText = true; } else if (qName.equals("text")) { inText = true; } } @Override @SuppressWarnings("unused") public void endElement(String namespaceURI, String sName, String qName) { if (qName.equals("title")) { title = text.toString(); text = new StringBuilder(); } else if (qName.equals("text")) { final String textToCheck = textFilter.filter(text.toString()); if (!textToCheck.contains("#REDIRECT")) { try { articleCount++; if (maxArticles > 0 && articleCount > maxArticles) { throw new ArticleLimitReachedException(maxArticles); } final List<RuleMatch> ruleMatches = languageTool.check(textToCheck); System.out.println("Checking article " + articleCount + " (" + textToCheck.length()/1024 + "KB, '" + title + "')" + ", found " + ruleMatches.size() + " matches"); try { handleResult(title, ruleMatches, textToCheck, languageTool.getLanguage()); } catch (ErrorLimitReachedException e) { throw e; } catch (Exception e) { throw new RuntimeException(e); } ruleMatchCount += ruleMatches.size(); } catch (IOException e) { throw new RuntimeException(e); } } text = new StringBuilder(); } inText = false; } @Override public void characters(char buf[], int offset, int len) { final String s = new String(buf, offset, len); if (inText) { text.append(s); } } protected abstract void handleResult(String title, List<RuleMatch> ruleMatches, String text, Language language) throws Exception; protected abstract void close(); }