/* LanguageTool, a natural language style checker * Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev.wikipedia.atom; import org.languagetool.JLanguageTool; import org.languagetool.Language; import org.languagetool.dev.wikipedia.LocationHelper; import org.languagetool.dev.wikipedia.PlainTextMapping; import org.languagetool.dev.wikipedia.SwebleWikipediaTextFilter; import org.languagetool.dev.wikipedia.TextMapFilter; import org.languagetool.rules.Rule; import org.languagetool.rules.RuleMatch; import org.languagetool.rules.patterns.AbstractPatternRule; import org.languagetool.tools.ContextTools; import xtc.tree.Location; import javax.xml.stream.XMLStreamException; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.net.URLConnection; import java.net.URLEncoder; import java.util.*; /** * Check the changes from a Wikipedia Atom feed with LanguageTool, only getting * the errors that have been introduced by that change. * @since 2.4 */ class AtomFeedChecker { private static final int CONTEXT_SIZE = 60; private static final String USER_AGENT = "http://tools.wmflabs.org/languagetool/ bot, contact: naber[@]danielnaber.de"; private final JLanguageTool langTool; private final Language language; private final MatchDatabase matchDatabase; private final TextMapFilter textFilter = new SwebleWikipediaTextFilter(); private final ContextTools contextTools = new ContextTools(); AtomFeedChecker(Language language) throws IOException { this(language, null); } AtomFeedChecker(Language language, DatabaseConfig dbConfig) throws IOException { this(language, dbConfig, null); } AtomFeedChecker(Language language, DatabaseConfig dbConfig, File languageModelDir) throws IOException { this.language = Objects.requireNonNull(language); langTool = new JLanguageTool(language); if (languageModelDir != null) { langTool.activateLanguageModelRules(languageModelDir); } // disable because they create too many false alarms: langTool.disableRule("UNPAIRED_BRACKETS"); langTool.disableRule("EN_UNPAIRED_BRACKETS"); langTool.disableRule("EN_QUOTES"); langTool.disableRule("COMMA_PARENTHESIS_WHITESPACE"); langTool.disableRule("UPPERCASE_SENTENCE_START"); langTool.disableRule("FRENCH_WHITESPACE"); // fr activateCategory("Wikipedia", langTool); disableSpellingRules(langTool); if (dbConfig != null) { matchDatabase = new MatchDatabase(dbConfig.getUrl(), dbConfig.getUser(), dbConfig.getPassword()); } else { matchDatabase = null; } contextTools.setContextSize(CONTEXT_SIZE); contextTools.setErrorMarkerStart("<err>"); contextTools.setErrorMarkerEnd("</err>"); contextTools.setEscapeHtml(false); } private void activateCategory(String categoryName, JLanguageTool langTool) { for (Rule rule : langTool.getAllRules()) { if (rule.getCategory().getName().equals(categoryName)) { System.out.println("Activating " + rule.getId() + " in category " + categoryName); langTool.enableRule(rule.getId()); } } } private void disableSpellingRules(JLanguageTool langTool) { for (Rule rule : langTool.getAllActiveRules()) { if (rule.isDictionaryBasedSpellingRule()) { langTool.disableRule(rule.getId()); System.out.println("Disabled spelling rule: " + rule.getId()); } } } CheckResult runCheck(InputStream feedStream) throws IOException { CheckResult checkResult = checkChanges(feedStream); storeResults(checkResult); return checkResult; } CheckResult runCheck(String url) throws IOException { CheckResult checkResult = checkChanges(new URL(url)); storeResults(checkResult); return checkResult; } private void storeResults(CheckResult checkResult) throws IOException { List<ChangeAnalysis> checkResults = checkResult.getCheckResults(); System.out.println("Check results:"); for (ChangeAnalysis result : checkResults) { List<WikipediaRuleMatch> addedMatches = result.getAddedMatches(); List<WikipediaRuleMatch> removedMatches = result.getRemovedMatches(); if (addedMatches.size() > 0 || removedMatches.size() > 0) { System.out.println("'" + result.getTitle() + "' new and removed matches:"); for (WikipediaRuleMatch match : addedMatches) { System.out.println(" [+] " + getId(match.getRule()) + ": " + match.getErrorContext()); if (matchDatabase != null) { matchDatabase.add(match); } } for (WikipediaRuleMatch match : removedMatches) { System.out.println(" [-] " + getId(match.getRule()) + ": " + match.getErrorContext()); if (matchDatabase != null) { matchDatabase.markedFixed(match); } } String diffLink = "https://" + language.getShortCode() + ".wikipedia.org/w/index.php?title=" + URLEncoder.encode(result.getTitle().replace(" ", "_"), "UTF-8") + "&diff=" + result.getDiffId(); System.out.println(" " + diffLink); } } } private String getId(Rule rule) { if (rule instanceof AbstractPatternRule) { return ((AbstractPatternRule) rule).getFullId(); } else { return rule.getId(); } } CheckResult checkChanges(URL atomFeedUrl) throws IOException { System.out.println("Getting atom feed from " + atomFeedUrl); try (InputStream xml = getXmlStream(atomFeedUrl)) { return checkChanges(xml); } } CheckResult checkChanges(InputStream xml) throws IOException { Date lastDateOfPreviousRun = matchDatabase != null ? matchDatabase.getLatestDate(language) : null; List<ChangeAnalysis> result = new ArrayList<>(); long latestDiffId = 0; int skipCount = 0; try { List<AtomFeedItem> items = new AtomFeedParser().getAtomFeedItems(xml); Collections.reverse(items); // older items must come first so we iterate in the order in which the changes were made printDates(items, lastDateOfPreviousRun); if (matchDatabase != null) { matchDatabase.updateRuleMatchPingDate(language, new Date()); } for (AtomFeedItem item : items) { // Note: this skipping is not always exact: // A resolution of one second may not be enough, considering the amount of changes happening, // but I didn't find an id that's constantly increasing (diff=... often but not always increases) if (lastDateOfPreviousRun != null && (item.getDate().before(lastDateOfPreviousRun) || item.getDate().equals(lastDateOfPreviousRun))) { System.out.println("Skipping " + item.getTitle() + ", date " + item.getDate()); skipCount++; } else { if (matchDatabase != null) { matchDatabase.updateRuleMatchCheckDate(language, item.getDate()); } try { System.out.println("Checking " + item.getTitle() + ", diff #" + item.getDiffId()); List<WikipediaRuleMatch> oldMatches = getMatches(item, item.getOldContent()); List<WikipediaRuleMatch> newMatches = getMatches(item, item.getNewContent()); ChangeAnalysis changeAnalysis = new ChangeAnalysis(item.getTitle(), item.getDiffId(), oldMatches, newMatches); result.add(changeAnalysis); if (item.getDiffId() > latestDiffId) { latestDiffId = item.getDiffId(); } } catch (Exception e) { //noinspection CallToPrintStackTrace e.printStackTrace(); // don't just stop because of Sweble conversion problems etc. } } } } catch (XMLStreamException e) { throw new RuntimeException(e); } if (lastDateOfPreviousRun != null && skipCount == 0) { System.err.println("Warning: no items from the Atom feed were skipped - this means that changes might be missing"); } return new CheckResult(result, latestDiffId); } /** Use for test cases only. */ MatchDatabase getDatabase() { return matchDatabase; } private void printDates(List<AtomFeedItem> items, Date lastDateOfPreviousRun) { if (items.size() > 0) { Date firstDate = items.get(0).getDate(); Date lastDate = items.get(items.size()-1).getDate(); System.out.println("Latest date in database: " + lastDateOfPreviousRun); System.out.println("Dates in Atom Feed: " + firstDate + " - " + lastDate); } } private List<WikipediaRuleMatch> getMatches(AtomFeedItem item, List<String> texts) throws IOException { List<WikipediaRuleMatch> oldMatches = new ArrayList<>(); for (String text : texts) { PlainTextMapping filteredContent = textFilter.filter(text); List<RuleMatch> ruleMatches = langTool.check(filteredContent.getPlainText()); oldMatches.addAll(toWikipediaRuleMatches(text, filteredContent, ruleMatches, item)); } return oldMatches; } private List<WikipediaRuleMatch> toWikipediaRuleMatches(String content, PlainTextMapping filteredContent, List<RuleMatch> ruleMatches, AtomFeedItem item) { List<WikipediaRuleMatch> result = new ArrayList<>(); for (RuleMatch ruleMatch : ruleMatches) { Location fromPos = filteredContent.getOriginalTextPositionFor(ruleMatch.getFromPos() + 1); Location toPos = filteredContent.getOriginalTextPositionFor(ruleMatch.getToPos() + 1); int origFrom = LocationHelper.absolutePositionFor(fromPos, content); int origTo = LocationHelper.absolutePositionFor(toPos, content); String errorContext = contextTools.getContext(origFrom, origTo, content); result.add(new WikipediaRuleMatch(language, ruleMatch, errorContext, item)); } return result; } private InputStream getXmlStream(URL url) throws IOException { URLConnection conn = url.openConnection(); conn.setRequestProperty("User-Agent", USER_AGENT); return conn.getInputStream(); } }