/* LanguageTool, a natural language style checker * Copyright (C) 2011 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev.wikipedia; import java.io.IOException; import java.io.InputStream; import java.io.StringReader; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.languagetool.JLanguageTool; import org.languagetool.Language; import org.languagetool.TextFilter; import org.languagetool.language.German; import org.languagetool.rules.Rule; import org.languagetool.rules.RuleMatch; import org.languagetool.tools.StringTools; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; /** * Check a Wikipedia page (without spell check), fetching the page via the MediaWiki API. */ public class WikipediaQuickCheck { private static final Pattern WIKIPEDIA_URL_REGEX = Pattern.compile("https?://(..)\\.wikipedia\\.org/wiki/(.*)"); private static final Pattern SECURE_WIKIPEDIA_URL_REGEX = Pattern.compile("https://secure\\.wikimedia\\.org/wikipedia/(..)/wiki/(.*)"); private List<String> disabledRuleIds = new ArrayList<String>(); public String getMediaWikiContent(URL wikipediaUrl) throws IOException { final Language lang = getLanguage(wikipediaUrl); final String pageTitle = getPageTitle(wikipediaUrl); final String apiUrl = "http://" + lang.getShortName() + ".wikipedia.org/w/api.php?titles=" + pageTitle + "&action=query&prop=revisions&rvprop=content&format=xml"; return getContent(new URL(apiUrl)); } public Language getLanguage(URL url) { final Matcher matcher = getUrlMatcher(url.toString()); return Language.getLanguageForShortName(matcher.group(1)); } public String getPageTitle(URL url) { final Matcher matcher = getUrlMatcher(url.toString()); return matcher.group(2); } private Matcher getUrlMatcher(String url) { final Matcher matcher1 = WIKIPEDIA_URL_REGEX.matcher(url); final Matcher matcher2 = SECURE_WIKIPEDIA_URL_REGEX.matcher(url); if (matcher1.matches()) { return matcher1; } else if (matcher2.matches()) { return matcher2; } throw new RuntimeException("URL does not seem to be a valid Wikipedia URL: " + url); } public void setDisabledRuleIds(List<String> ruleIds) { disabledRuleIds = ruleIds; } public List<String> getDisabledRuleIds() { return disabledRuleIds; } public WikipediaQuickCheckResult checkPage(String plainText, Language lang) throws IOException { final JLanguageTool langTool = getLanguageTool(lang); final List<RuleMatch> ruleMatches = langTool.check(plainText); return new WikipediaQuickCheckResult(plainText, ruleMatches, lang.getShortName()); } public void validateWikipediaUrl(URL wikipediaUrl) { // will throw exception if URL is not valid: getUrlMatcher(wikipediaUrl.toString()); } String getPlainText(String completeWikiContent) { final String wikiContent = getRevisionContent(completeWikiContent); final String cleanedWikiContent = removeInterLanguageLinks(wikiContent); final TextFilter filter = new SwebleWikipediaTextFilter(); final String plainText = filter.filter(cleanedWikiContent); return plainText; } // catches most, not all links ("[[pt:Linux]]", but not "[[zh-min-nan:Linux]]"). Might remove some non-interlanguage links. String removeInterLanguageLinks(String wikiContent) { return wikiContent.replaceAll("\\[\\[[a-z]{2,6}:.*?\\]\\]", ""); } private String getRevisionContent(String completeWikiContent) { final SAXParserFactory factory = SAXParserFactory.newInstance(); final SAXParser saxParser; final RevisionContentHandler handler = new RevisionContentHandler(); try { saxParser = factory.newSAXParser(); saxParser.parse(new InputSource(new StringReader(completeWikiContent)), handler); } catch (Exception e) { throw new RuntimeException("Could not parse XML: " + completeWikiContent, e); } return handler.getRevisionContent(); } private JLanguageTool getLanguageTool(Language lang) throws IOException { final JLanguageTool langTool = new JLanguageTool(lang); langTool.activateDefaultPatternRules(); for (String disabledRuleId : disabledRuleIds) { langTool.disableRule(disabledRuleId); } disableSpellingRules(langTool); return langTool; } private void disableSpellingRules(JLanguageTool languageTool) { final List<Rule> allActiveRules = languageTool.getAllActiveRules(); for (Rule rule : allActiveRules) { if (rule.isSpellingRule()) { languageTool.disableRule(rule.getId()); } } } private String getContent(URL wikipediaUrl) throws IOException { final InputStream contentStream = (InputStream) wikipediaUrl.getContent(); return StringTools.streamToString(contentStream, "UTF-8"); } /*public static void mainTest(String[] args) throws IOException { final TextFilter filter = new SwebleWikipediaTextFilter(); final String plainText = filter.filter("hallo\n* eins\n* zwei"); System.out.println(plainText); }*/ public static void main(String[] args) throws IOException { if (args.length != 1) { System.out.println("Usage: " + WikipediaQuickCheck.class.getName() + " <url>"); System.exit(1); } final WikipediaQuickCheck check = new WikipediaQuickCheck(); // URL examples: //final String urlString = "http://de.wikipedia.org/wiki/Angela_Merkel"; //final String urlString = "https://de.wikipedia.org/wiki/Benutzer_Diskussion:Dnaber"; //final String urlString = "https://secure.wikimedia.org/wikipedia/de/wiki/G%C3%BCtersloh"; //final String urlString = "https://secure.wikimedia.org/wikipedia/de/wiki/Benutzer_Diskussion:Dnaber"; final String urlString = args[0]; final URL url = new URL(urlString); final String mediaWikiContent = check.getMediaWikiContent(url); final String plainText = check.getPlainText(mediaWikiContent); final WikipediaQuickCheckResult checkResult = check.checkPage(plainText, new German()); for (RuleMatch ruleMatch : checkResult.getRuleMatches()) { System.out.println(ruleMatch.getMessage()); final String context = StringTools.getContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), checkResult.getText()); System.out.println(context); } } class RevisionContentHandler extends DefaultHandler { private final StringBuilder revisionText = new StringBuilder(); private boolean inRevision = false; @Override public void startElement(final String namespaceURI, final String lName, final String qName, final Attributes attrs) throws SAXException { if ("rev".equals(qName)) { inRevision = true; } } @Override public void endElement(final String namespaceURI, final String sName, final String qName) throws SAXException { if ("rev".equals(qName)) { inRevision = false; } } @Override public void characters(final char[] buf, final int offset, final int len) { final String s = new String(buf, offset, len); if (inRevision) { revisionText.append(s); } } public String getRevisionContent() { return revisionText.toString(); } } }