/* LanguageTool, a natural language style checker
* Copyright (C) 2011 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.wikipedia;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.MultiThreadedJLanguageTool;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.patterns.AbstractPatternRule;
import org.languagetool.tools.StringTools;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* Check a Wikipedia page (without spell check), fetching the page via the MediaWiki API.
*/
public class WikipediaQuickCheck {
private static final Pattern WIKIPEDIA_URL_REGEX = Pattern.compile("https?://(..)\\.wikipedia\\.org/wiki/(.*)");
private static final Pattern SECURE_WIKIPEDIA_URL_REGEX = Pattern.compile("https://secure\\.wikimedia\\.org/wikipedia/(..)/wiki/(.*)");
private final File ngramDir;
private final int maxSizeBytes;
private List<String> disabledRuleIds = new ArrayList<>();
public WikipediaQuickCheck() {
this(null, Integer.MAX_VALUE);
}
/**
* @since 3.1
* @param ngramDir directory with sub directories like 'en', 'de' etc that contain '1grams' etc directories with ngram data (Lucene indexes)
*/
public WikipediaQuickCheck(File ngramDir) {
this(ngramDir, Integer.MAX_VALUE);
}
/**
* @since 3.3
* @param ngramDir directory with sub directories like 'en', 'de' etc that contain '1grams' etc directories with ngram data (Lucene indexes)
* @param maxSizeBytes the maximum bytes of XML for the methods that take an URL, longer content will throw an exception
*/
public WikipediaQuickCheck(File ngramDir, int maxSizeBytes) {
this.ngramDir = ngramDir;
this.maxSizeBytes = maxSizeBytes;
}
public String getMediaWikiContent(URL wikipediaUrl) throws IOException {
Language lang = getLanguage(wikipediaUrl);
String pageTitle = getPageTitle(wikipediaUrl);
String apiUrl = "https://" + lang.getShortCode() + ".wikipedia.org/w/api.php?titles="
+ URLEncoder.encode(pageTitle, "utf-8") + "&action=query&prop=revisions&rvprop=content|timestamp&format=xml";
return getContent(new URL(apiUrl));
}
public Language getLanguage(URL url) {
Matcher matcher = getUrlMatcher(url.toString());
return Languages.getLanguageForShortCode(matcher.group(1));
}
public String getPageTitle(URL url) {
Matcher matcher = getUrlMatcher(url.toString());
return matcher.group(2);
}
private Matcher getUrlMatcher(String url) {
Matcher matcher1 = WIKIPEDIA_URL_REGEX.matcher(url);
Matcher matcher2 = SECURE_WIKIPEDIA_URL_REGEX.matcher(url);
if (matcher1.matches()) {
return matcher1;
} else if (matcher2.matches()) {
return matcher2;
}
throw new RuntimeException("URL does not seem to be a valid Wikipedia URL: " + url);
}
public void setDisabledRuleIds(List<String> ruleIds) {
disabledRuleIds = ruleIds;
}
public List<String> getDisabledRuleIds() {
return disabledRuleIds;
}
public MarkupAwareWikipediaResult checkPage(URL url) throws IOException, PageNotFoundException {
return checkPage(url, null);
}
/**
* @since 2.6
*/
public MarkupAwareWikipediaResult checkPage(URL url, ErrorMarker errorMarker) throws IOException, PageNotFoundException {
validateWikipediaUrl(url);
String xml = getMediaWikiContent(url);
if (xml.length() > maxSizeBytes) {
throw new RuntimeException("Sorry, the content at " + url + " is too big - this process has been limited to " + maxSizeBytes +
" bytes, but the content is " + xml.length() + " bytes");
}
MediaWikiContent wikiContent = getRevisionContent(xml);
String content = wikiContent.getContent();
if (content.trim().isEmpty()) {
throw new PageNotFoundException("No content found at '" + url + "'");
}
if (content.toLowerCase().contains("#redirect")) {
throw new PageNotFoundException("No content but redirect found at '" + url + "'");
}
return checkWikipediaMarkup(url, wikiContent, getLanguage(url), errorMarker);
}
MarkupAwareWikipediaResult checkWikipediaMarkup(URL url, MediaWikiContent wikiContent, Language language, ErrorMarker errorMarker) throws IOException {
SwebleWikipediaTextFilter filter = new SwebleWikipediaTextFilter();
PlainTextMapping mapping = filter.filter(wikiContent.getContent());
MultiThreadedJLanguageTool langTool = getLanguageTool(language);
List<AppliedRuleMatch> appliedMatches = new ArrayList<>();
List<RuleMatch> matches;
try {
matches = langTool.check(mapping.getPlainText());
} finally {
langTool.shutdown();
}
int internalErrors = 0;
for (RuleMatch match : matches) {
SuggestionReplacer replacer = errorMarker != null ?
new SuggestionReplacer(mapping, wikiContent.getContent(), errorMarker) :
new SuggestionReplacer(mapping, wikiContent.getContent());
try {
List<RuleMatchApplication> ruleMatchApplications = replacer.applySuggestionsToOriginalText(match);
appliedMatches.add(new AppliedRuleMatch(match, ruleMatchApplications));
} catch (Exception e) {
System.err.println("Failed to apply suggestion for rule match '" + match + "' for URL " + url + ": " + e);
internalErrors++;
}
}
return new MarkupAwareWikipediaResult(wikiContent, appliedMatches, internalErrors);
}
public WikipediaQuickCheckResult checkPage(String plainText, Language lang) throws IOException {
MultiThreadedJLanguageTool langTool = getLanguageTool(lang);
try {
List<RuleMatch> ruleMatches = langTool.check(plainText);
return new WikipediaQuickCheckResult(plainText, ruleMatches, lang.getShortCode());
} finally {
langTool.shutdown();
}
}
public void validateWikipediaUrl(URL wikipediaUrl) {
// will throw exception if URL is not valid:
getUrlMatcher(wikipediaUrl.toString());
}
/**
* @param completeWikiContent the Mediawiki syntax as it comes from the API, including surrounding XML
*/
public String getPlainText(String completeWikiContent) {
MediaWikiContent wikiContent = getRevisionContent(completeWikiContent);
String cleanedWikiContent = removeWikipediaLinks(wikiContent.getContent());
TextMapFilter filter = new SwebleWikipediaTextFilter();
return filter.filter(cleanedWikiContent).getPlainText();
}
/**
* @param completeWikiContent the Mediawiki syntax as it comes from the API, including surrounding XML
*/
public PlainTextMapping getPlainTextMapping(String completeWikiContent) {
MediaWikiContent wikiContent = getRevisionContent(completeWikiContent);
SwebleWikipediaTextFilter filter = new SwebleWikipediaTextFilter();
return filter.filter(wikiContent.getContent());
}
// catches most, not all links ("[[pt:Linux]]", but not "[[zh-min-nan:Linux]]"). Might remove some non-interlanguage links.
String removeWikipediaLinks(String wikiContent) {
// interlanguage links
return wikiContent
.replaceAll("\\[\\[[a-z]{2,6}:.*?\\]\\]", "")
// category links
.replaceAll(
"\\[\\[:?(Category|Categoria|Categoría|Catégorie|Kategorie):.*?\\]\\]", "")
// file links, keeps alt and caption
.replaceAll(
"(File|Fitxer|Fichero|Ficheiro|Fichier|Datei):.*?\\.(png|jpg|svg|jpeg|tiff|gif|PNG|JPG|SVG|JPEG|TIFF|GIF)\\|((thumb|miniatur)\\|)?((right|left)\\|)?", "");
}
private MediaWikiContent getRevisionContent(String completeWikiContent) {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser;
RevisionContentHandler handler = new RevisionContentHandler();
try {
saxParser = factory.newSAXParser();
saxParser.parse(new InputSource(new StringReader(completeWikiContent)), handler);
} catch (Exception e) {
throw new RuntimeException("Could not parse XML: " + completeWikiContent, e);
}
return new MediaWikiContent(handler.getRevisionContent(), handler.getTimestamp());
}
private MultiThreadedJLanguageTool getLanguageTool(Language lang) throws IOException {
MultiThreadedJLanguageTool langTool = new MultiThreadedJLanguageTool(lang);
enableWikipediaRules(langTool);
for (String disabledRuleId : disabledRuleIds) {
langTool.disableRule(disabledRuleId);
}
if (ngramDir != null) {
langTool.activateLanguageModelRules(ngramDir);
}
disableSpellingRules(langTool);
return langTool;
}
private void enableWikipediaRules(JLanguageTool langTool) {
List<Rule> allRules = langTool.getAllRules();
for (Rule rule : allRules) {
if (rule.getCategory().getName().equals("Wikipedia")) {
langTool.enableRule(rule.getId());
}
}
}
private void disableSpellingRules(JLanguageTool languageTool) {
List<Rule> allActiveRules = languageTool.getAllActiveRules();
for (Rule rule : allActiveRules) {
if (rule.isDictionaryBasedSpellingRule()) {
languageTool.disableRule(rule.getId());
}
}
}
private String getContent(URL wikipediaUrl) throws IOException {
try {
HttpURLConnection conn = (HttpURLConnection) wikipediaUrl.openConnection();
conn.setRequestMethod("GET");
conn.setConnectTimeout(30_000);
conn.setReadTimeout(30_000);
conn.connect();
try (InputStream contentStream = (InputStream) conn.getContent()) {
return StringTools.streamToString(contentStream, "UTF-8");
}
} catch (SocketTimeoutException e) {
throw new RuntimeException("Timeout accessing " + wikipediaUrl, e);
}
}
/*public static void mainTest(String[] args) throws IOException {
TextFilter filter = new SwebleWikipediaTextFilter();
String plainText = filter.filter("hallo\n* eins\n* zwei");
System.out.println(plainText);
}*/
public static void main(String[] args) throws IOException, PageNotFoundException {
if (args.length != 1) {
System.out.println("Usage: " + WikipediaQuickCheck.class.getName() + " <url>");
System.exit(1);
}
WikipediaQuickCheck check = new WikipediaQuickCheck();
// URL examples:
//String urlString = "http://de.wikipedia.org/wiki/Angela_Merkel";
//String urlString = "https://de.wikipedia.org/wiki/Benutzer_Diskussion:Dnaber";
//String urlString = "https://secure.wikimedia.org/wikipedia/de/wiki/Gütersloh";
String urlString = args[0];
MarkupAwareWikipediaResult result = check.checkPage(new URL(urlString), new ErrorMarker("***", "***"));
int errorCount = 0;
for (AppliedRuleMatch match : result.getAppliedRuleMatches()) {
RuleMatchApplication matchApplication = match.getRuleMatchApplications().get(0);
RuleMatch ruleMatch = match.getRuleMatch();
Rule rule = ruleMatch.getRule();
System.out.println("");
String message = ruleMatch.getMessage().replace("<suggestion>", "'").replace("</suggestion>", "'");
errorCount++;
System.out.print(errorCount + ". " + message);
if (rule instanceof AbstractPatternRule) {
System.out.println(" (" + ((AbstractPatternRule) rule).getFullId() + ")");
} else {
System.out.println(" (" + rule.getId() + ")");
}
System.out.println(" ..." + matchApplication.getOriginalErrorContext(50).replace("\n", "\\n") + "...");
}
}
class RevisionContentHandler extends DefaultHandler {
private final StringBuilder revisionText = new StringBuilder();
private String timestamp;
private boolean inRevision = false;
@Override
public void startElement(String namespaceURI, String lName,
String qName, Attributes attrs) throws SAXException {
if ("rev".equals(qName)) {
timestamp = attrs.getValue("timestamp");
inRevision = true;
}
}
@Override
public void endElement(String namespaceURI, String sName,
String qName) throws SAXException {
if ("rev".equals(qName)) {
inRevision = false;
}
}
@Override
public void characters(char[] buf, int offset, int len) {
String s = new String(buf, offset, len);
if (inRevision) {
revisionText.append(s);
}
}
public String getRevisionContent() {
return revisionText.toString();
}
public String getTimestamp() {
return timestamp;
}
}
}