/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.wikipedia;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.rules.Rule;
import org.xml.sax.SAXException;
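/**
* Command-line tool that checks texts from an unpacked Wikipedia XML dump and either
* stores the result in a database or prints it to stdout.
* Download "pages-articles.xml.bz2" from http://dumps.wikimedia.org/backup-index.html
* (e.g. http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2)
* and unpack it. The name of the unpacked file must look like
* ??wiki-????????-pages-articles.xml, as the dump date (yyyymmdd) is read from it.
*
* @author Daniel Naber
*/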
public class CheckWikipediaDump {

  private CheckWikipediaDump() {
    // no public constructor
  }

  /**
   * Command-line entry point. Expects exactly seven arguments (see
   * {@link #ensureCorrectUsageOrExit(String[])} for their meaning); prints the usage
   * text and exits with status 1 otherwise.
   *
   * @param args propertyFile, rulePropertyFile, language, filename, ruleIds, maxArticles, maxErrors
   * @throws IOException if one of the given files does not exist, is not a regular file,
   *         cannot be read, or if the dump file name has an unexpected format
   * @throws SAXException if the dump cannot be parsed as XML
   * @throws ParserConfigurationException if no SAX parser can be created
   * @throws NumberFormatException if maxArticles or maxErrors is not an integer
   */
  public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
    // validate the argument count before doing anything else
    ensureCorrectUsageOrExit(args);
    final CheckWikipediaDump prg = new CheckWikipediaDump();
    // "-" means: no database properties, print the results to stdout instead
    File propFile = null;
    if (!"-".equals(args[0])) {
      propFile = new File(args[0]);
      if (!propFile.exists() || propFile.isDirectory()) {
        throw new IOException("File not found or isn't a file: " + propFile.getAbsolutePath());
      }
    }
    final String languageCode = args[2];
    final Set<String> disabledRuleIds = new HashSet<String>();
    // "-" means: no file with rules to be disabled
    if (!"-".equals(args[1])) {
      final File disabledRulesPropFile = new File(args[1]);
      if (!disabledRulesPropFile.exists() || disabledRulesPropFile.isDirectory()) {
        throw new IOException("File not found or isn't a file: " + disabledRulesPropFile.getAbsolutePath());
      }
      final Properties disabledRules = loadProperties(disabledRulesPropFile);
      // rules listed under "all" are disabled for every language
      addDisabledRules("all", disabledRuleIds, disabledRules);
      addDisabledRules(languageCode, disabledRuleIds, disabledRules);
    }
    final int maxArticles = Integer.parseInt(args[5]);
    final int maxErrors = Integer.parseInt(args[6]);
    // "-" means: keep the default rules instead of activating only the given ones
    String[] ruleIds = null;
    if (!"-".equals(args[4])) {
      ruleIds = args[4].split(",");
    }
    prg.run(propFile, disabledRuleIds, languageCode, args[3], ruleIds, maxArticles, maxErrors);
  }

  /**
   * Loads a properties file and makes sure the underlying stream is closed again,
   * even if reading fails.
   */
  private static Properties loadProperties(File file) throws IOException {
    final Properties properties = new Properties();
    final FileInputStream in = new FileInputStream(file);
    try {
      properties.load(in);
    } finally {
      in.close();
    }
    return properties;
  }

  /**
   * Adds the comma-separated rule ids stored under the key {@code languageCode}
   * to {@code disabledRuleIds}. Whitespace around the ids is ignored and empty
   * entries are skipped, so that "RULE1, RULE2" works as expected. Does nothing
   * if there is no entry for the key.
   *
   * @param languageCode the property key, i.e. a language code or "all"
   * @param disabledRuleIds the set the ids are added to
   * @param disabledRules the properties to read from
   */
  private static void addDisabledRules(String languageCode, Set<String> disabledRuleIds, Properties disabledRules) {
    final String disabledRulesString = disabledRules.getProperty(languageCode);
    if (disabledRulesString == null) {
      return;
    }
    for (String id : disabledRulesString.split(",")) {
      final String trimmedId = id.trim();
      if (trimmedId.length() > 0) {
        disabledRuleIds.add(trimmedId);
      }
    }
  }

  /**
   * Prints the usage text to stderr and terminates the JVM with exit status 1
   * unless exactly seven arguments have been given.
   */
  private static void ensureCorrectUsageOrExit(String[] args) {
    if (args.length != 7) {
      System.err.println("Usage: CheckWikipediaDump <propertyFile> <rulePropertyFile> <language> <filename> <ruleIds> <maxArticles> <maxErrors>");
      System.err.println(" propertyFile a file to set database access properties. Use '-' to print results to stdout.");
      System.err.println(" rulePropertyFile a file to set rules which should be disabled per language (e.g. en=RULE1,RULE2 or all=RULE3,RULE4). Use '-' to ignore.");
      System.err.println(" language language code like 'en' or 'de'");
      System.err.println(" filename path to unpacked Wikipedia XML dump;");
      System.err.println(" dumps are available from http://dumps.wikimedia.org/backup-index.html");
      System.err.println(" ruleIds comma-separated list of rule-ids to activate. Use '-' to activate the default rules.");
      System.err.println(" maxArticles maximum number of articles to check, 0 for no limit");
      System.err.println(" maxErrors stop when reaching this many errors, 0 for no limit");
      System.exit(1);
    }
  }

  /**
   * Configures LanguageTool for the given language, parses the dump with a SAX parser
   * and finally prints a short summary (total matches and matches per document).
   *
   * @param propFile database access properties, or {@code null} to print results to stdout
   * @param disabledRules ids of rules to disable; only used if {@code ruleIds} is {@code null}
   * @param langCode language code like 'en' or 'de'
   * @param xmlFileName path to the unpacked Wikipedia XML dump
   * @param ruleIds the only rules to enable, or {@code null} to use the default rules
   * @param maxArticles maximum number of articles to check, 0 for no limit
   * @param maxErrors stop when reaching this many errors, 0 for no limit
   * @throws IOException if the dump file doesn't exist, isn't a file, or its name
   *         doesn't contain a valid dump date
   */
  private void run(File propFile, Set<String> disabledRules, String langCode, String xmlFileName, String[] ruleIds, int maxArticles, int maxErrors)
      throws IOException, SAXException, ParserConfigurationException {
    final File file = new File(xmlFileName);
    if (!file.exists() || !file.isFile()) {
      throw new IOException("File doesn't exist or isn't a file: " + xmlFileName);
    }
    final Language lang = Language.getLanguageForShortName(langCode);
    final JLanguageTool languageTool = new JLanguageTool(lang);
    languageTool.activateDefaultPatternRules();
    // an explicit list of rule ids takes precedence over the list of disabled rules
    if (ruleIds != null) {
      enableSpecifiedRules(ruleIds, languageTool);
    } else {
      applyRuleDeactivation(languageTool, disabledRules);
    }
    disableSpellingRules(languageTool);
    final Date dumpDate = getDumpFileDate(file);
    System.out.println("Dump date: " + dumpDate + ", language: " + langCode);
    System.out.println("Article limit: " + (maxArticles > 0 ? maxArticles : "no limit"));
    System.out.println("Error limit: " + (maxErrors > 0 ? maxErrors : "no limit"));
    BaseWikipediaDumpHandler xmlHandler = null;
    try {
      if (propFile != null) {
        xmlHandler = new DatabaseDumpHandler(languageTool, dumpDate, langCode, propFile, lang);
      } else {
        xmlHandler = new OutputDumpHandler(languageTool, dumpDate, langCode, lang);
      }
      xmlHandler.setMaximumArticles(maxArticles);
      xmlHandler.setMaximumErrors(maxErrors);
      final SAXParserFactory factory = SAXParserFactory.newInstance();
      final SAXParser saxParser = factory.newSAXParser();
      saxParser.parse(file, xmlHandler);
    } catch (ErrorLimitReachedException e) {
      // reaching a limit is a regular way to stop, so just report it
      System.out.println(e);
    } catch (ArticleLimitReachedException e) {
      System.out.println(e);
    } finally {
      if (xmlHandler != null) {
        // avoid printing NaN if not a single article has been processed
        float matchesPerDoc = 0;
        if (xmlHandler.getArticleCount() > 0) {
          matchesPerDoc = (float) xmlHandler.getRuleMatchCount() / xmlHandler.getArticleCount();
        }
        // the language is passed as an argument so it can never be misread as a format specifier
        System.out.printf("%s: %d total matches\n", lang, xmlHandler.getRuleMatchCount());
        // \u00F8 is the average sign (o with stroke)
        System.out.printf("%s: \u00F8%.2f rule matches per document\n", lang, matchesPerDoc);
        xmlHandler.close();
      }
    }
  }

  /**
   * Disables all rules, then enables the rules with the given ids again and
   * prints a warning for every id that doesn't belong to any known rule.
   *
   * @param ruleIds ids of the only rules that should stay enabled
   * @param languageTool the instance to configure
   */
  private void enableSpecifiedRules(String[] ruleIds, JLanguageTool languageTool) {
    for (Rule rule : languageTool.getAllRules()) {
      languageTool.disableRule(rule.getId());
    }
    for (String ruleId : ruleIds) {
      languageTool.enableRule(ruleId);
    }
    // NOTE(review): presumably needed so that requested rules which are off by default
    // can become active at all - confirm against JLanguageTool.enableDefaultOffRule()
    final Set<String> knownRuleIds = new HashSet<String>();
    for (Rule rule : languageTool.getAllRules()) {
      if (rule.isDefaultOff()) {
        languageTool.enableDefaultOffRule(rule.getId());
      }
      knownRuleIds.add(rule.getId());
    }
    for (String ruleId : ruleIds) {
      if (!knownRuleIds.contains(ruleId)) {
        System.out.println("WARNING: Could not find rule '" + ruleId + "'");
      }
    }
    System.out.println("Only these rules are enabled: " + Arrays.toString(ruleIds));
  }

  /**
   * Disables the rules with the given ids and prints the resulting list of disabled rules.
   *
   * @param languageTool the instance to configure
   * @param disabledRules ids of the rules to disable
   */
  private void applyRuleDeactivation(JLanguageTool languageTool, Set<String> disabledRules) throws IOException {
    // disabled via config file, usually to avoid too many false alarms:
    for (String disabledRuleId : disabledRules) {
      languageTool.disableRule(disabledRuleId);
    }
    System.out.println("These rules are disabled: " + languageTool.getDisabledRules());
  }

  /**
   * Disables every currently active rule that reports itself as a spelling rule.
   */
  private void disableSpellingRules(JLanguageTool languageTool) {
    final List<Rule> allActiveRules = languageTool.getAllActiveRules();
    for (Rule rule : allActiveRules) {
      if (rule.isSpellingRule()) {
        languageTool.disableRule(rule.getId());
      }
    }
    System.out.println("All spelling rules are disabled");
  }

  /**
   * Extracts the dump date from the name of the dump file, which is expected to
   * look like {@code ??wiki-????????-pages-articles.xml}: the part between the first
   * and the second dash is parsed as {@code yyyyMMdd}.
   *
   * @param file the Wikipedia dump file
   * @return the date encoded in the file name
   * @throws IOException if the file name has fewer than three dash-separated parts
   *         or the second part is not a date
   */
  private Date getDumpFileDate(File file) throws IOException {
    final String filename = file.getName();
    final String[] parts = filename.split("-");
    if (parts.length < 3) {
      throw new IOException("Unexpected filename format: " + file.getName() + ", must be like ??wiki-????????-pages-articles.xml");
    }
    final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
    try {
      return sdf.parse(parts[1]);
    } catch (ParseException e) {
      throw new IOException("Unexpected date format '" + parts[1] + "', must be yyyymmdd", e);
    }
  }

}