/* * Cloud9: A MapReduce Library for Hadoop * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.commoncrawl.util; import info.bliki.htmlcleaner.TagNode; import info.bliki.wiki.filter.ITextConverter; import info.bliki.wiki.filter.PlainTextConverter; import info.bliki.wiki.model.IWikiModel; import info.bliki.wiki.model.ImageFormat; import info.bliki.wiki.model.WikiModel; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import org.apache.commons.lang.StringEscapeUtils; import org.apache.hadoop.io.WritableUtils; /** * A page from Wikipedia. * * @author Jimmy Lin */ public class WikipediaPage { /** * Start delimiter of the page, which is <<code>page</code>>. */ public static final String XML_START_TAG = "<page>"; /** * End delimiter of the page, which is <<code>/page</code>>. */ public static final String XML_END_TAG = "</page>"; private String page; private String title; private String mId; private int textStart; private int textEnd; private boolean isRedirect; private boolean isDisambig; private boolean isStub; private String language; private WikiModel wikiModel; private PlainTextConverter textConverter; /** * Creates an empty <code>WikipediaPage</code> object. */ public WikipediaPage() { wikiModel = new WikiModel("", ""); textConverter = new PlainTextConverter(); } /** * Deserializes this object. */ public void write(DataOutput out) throws IOException { byte[] bytes = page.getBytes(); WritableUtils.writeVInt(out, bytes.length); out.write(bytes, 0, bytes.length); out.writeUTF(language); } /** * Serializes this object. */ public void readFields(DataInput in) throws IOException { int length = WritableUtils.readVInt(in); byte[] bytes = new byte[length]; in.readFully(bytes, 0, length); WikipediaPage.readPage(this, new String(bytes)); language = in.readUTF(); } /** * Returns the article title (i.e., the docid). */ public String getDocid() { return mId; } public void setLanguage(String language) { this.language = language; } public String getLanguage() { return this.language; } // Explictly remove <ref>...</ref>, because there are screwy things like this: // <ref>[http://www.interieur.org/<!-- Bot generated title -->]</ref> // where "http://www.interieur.org/<!--" gets interpreted as the URL by // Bliki in conversion to text private static final Pattern REF = Pattern.compile("<ref>.*?</ref>"); private static final Pattern LANG_LINKS = Pattern.compile("\\[\\[[a-z\\-]+:[^\\]]+\\]\\]"); private static final Pattern DOUBLE_CURLY = Pattern.compile("\\{\\{.*?\\}\\}"); private static final Pattern URL = Pattern.compile("http://[^ <]+"); // Note, don't capture possible HTML tag private static final Pattern HTML_TAG = Pattern.compile("<[^!][^>]*>"); // Note, don't capture comments private static final Pattern HTML_COMMENT = Pattern.compile("<!--.*?-->", Pattern.DOTALL); private static final String ANCHOR_REF_PATTERN = "</ref>"; private static final String AMPERSAND_PATTERN = "&"; /** * Returns the contents of this page (title + text). */ public String getContent() { String s = getWikiMarkup(); // Bliki doesn't seem to properly handle inter-language links, so remove manually. s = LANG_LINKS.matcher(s).replaceAll(" "); wikiModel.setUp(); s = getTitle() + "\n" + wikiModel.render(textConverter, s); wikiModel.tearDown(); // The way the some entities are encoded, we have to unescape twice. s = StringEscapeUtils.unescapeHtml(StringEscapeUtils.unescapeHtml(s)); s = REF.matcher(s).replaceAll(" "); s = HTML_COMMENT.matcher(s).replaceAll(" "); // Sometimes, URL bumps up against comments e.g., <!-- http://foo.com/--> // Therefore, we want to remove the comment first; otherwise the URL pattern might eat up // the comment terminator. s = URL.matcher(s).replaceAll(" "); s = DOUBLE_CURLY.matcher(s).replaceAll(" "); s = HTML_TAG.matcher(s).replaceAll(" "); return s; } /** * Returns the contents of this page (title + text). */ public String getLinks() { String s = getWikiMarkup(); if (s != null) { // Bliki doesn't seem to properly handle inter-language links, so remove manually. s = LANG_LINKS.matcher(s).replaceAll(" "); wikiModel.setUp(); s = wikiModel.render(new LinkRenderer(), s); wikiModel.tearDown(); return s; } return ""; } public String getDisplayContent() { wikiModel.setUp(); String s = "<h1>" + getTitle() + "</h1>\n" + wikiModel.render(getWikiMarkup()); wikiModel.tearDown(); s = DOUBLE_CURLY.matcher(s).replaceAll(" "); return s; } public String getDisplayContentType() { return "text/html"; } /** * Returns the raw XML of this page. */ public String getRawXML() { return page; } /** * Returns the text of this page. */ public String getWikiMarkup() { if (textStart == -1) return null; return page.substring(textStart + 27, textEnd); } /** * Returns the title of this page. */ public String getTitle() { return title; } /** * Checks to see if this page is a disambiguation page. A * <code>WikipediaPage</code> is either an article, a disambiguation page, * a redirect page, or an empty page. * * @return <code>true</code> if this page is a disambiguation page */ public boolean isDisambiguation() { return isDisambig; } /** * Checks to see if this page is a redirect page. A * <code>WikipediaPage</code> is either an article, a disambiguation page, * a redirect page, or an empty page. * * @return <code>true</code> if this page is a redirect page */ public boolean isRedirect() { return isRedirect; } /** * Checks to see if this page is an empty page. A <code>WikipediaPage</code> * is either an article, a disambiguation page, a redirect page, or an empty * page. * * @return <code>true</code> if this page is an empty page */ public boolean isEmpty() { return textStart == -1; } /** * Checks to see if this article is a stub. Return value is only meaningful * if this page isn't a disambiguation page, a redirect page, or an empty * page. * * @return <code>true</code> if this article is a stub */ public boolean isStub() { return isStub; } /** * Checks to see if this page is an actual article, and not, for example, * "File:", "Category:", "Wikipedia:", etc. * * @return <code>true</code> if this page is an actual article */ public boolean isArticle() { return !(getTitle().startsWith("File:") || getTitle().startsWith("Category:") || getTitle().startsWith("Special:") || getTitle().startsWith("Wikipedia:") || getTitle().startsWith("Wikipedia:") || getTitle().startsWith("Template:") || getTitle().startsWith("Portal:")); } /** * Returns the inter-language link to a specific language (if any). * * @param lang * language * @return title of the article in the foreign language if link exists, * <code>null</code> otherwise */ public String findInterlanguageLink(String lang) { int start = page.indexOf("[[" + lang + ":"); if (start < 0) return null; int end = page.indexOf("]]", start); if (end < 0) return null; // Some pages have malformed links. For example, "[[de:Frances Willard]" // in enwiki-20081008-pages-articles.xml.bz2 has only one closing square // bracket. Temporary solution is to ignore malformed links (instead of // trying to hack around them). String link = page.substring(start + 3 + lang.length(), end); // If a newline is found, it probably means that the link is malformed // (see above comment). Abort in this case. if (link.indexOf("\n") != -1) { return null; } if (link.length() == 0) return null; return link; } public List<String> extractLinkDestinations() { int start = 0; List<String> links = new ArrayList<String>(); while (true) { start = page.indexOf("[[", start); if (start < 0) break; int end = page.indexOf("]]", start); if (end < 0) break; String text = page.substring(start + 2, end); // skip empty links if (text.length() == 0) { start = end + 1; continue; } // skip special links if (text.indexOf(":") != -1) { start = end + 1; continue; } // if there is anchor text, get only article title int a; if ((a = text.indexOf("|")) != -1) { text = text.substring(0, a); } if ((a = text.indexOf("#")) != -1) { text = text.substring(0, a); } // ignore article-internal links, e.g., [[#section|here]] if (text.length() == 0 ) { start = end + 1; continue; } links.add(text.trim()); start = end + 1; } return links; } /** * Reads a raw XML string into a <code>WikipediaPage</code> object. * * @param page * the <code>WikipediaPage</code> object * @param s * raw XML string */ public static void readPage(WikipediaPage page, String s) { page.page = s; // parse out title int start = s.indexOf("<title>"); int end = s.indexOf("</title>", start); page.title = StringEscapeUtils.unescapeHtml(s.substring(start + 7, end)); start = s.indexOf("<id>"); end = s.indexOf("</id>"); page.mId = s.substring(start + 4, end); // parse out actual text of article page.textStart = s.indexOf("<text xml:space=\"preserve\">"); page.textEnd = s.indexOf("</text>", page.textStart); page.isDisambig = s.indexOf("{{disambig", page.textStart) != -1 || s.indexOf("{{Disambig", page.textStart) != -1; page.isRedirect = s.substring(page.textStart + 27, page.textStart + 36).compareTo("#REDIRECT") == 0 || s.substring(page.textStart + 27, page.textStart + 36).compareTo("#redirect") == 0; page.isStub = s.indexOf("stub}}", page.textStart) != -1; } public static class LinkRenderer implements ITextConverter { @Override public void imageNodeToText(TagNode arg0, ImageFormat arg1,Appendable arg2, IWikiModel arg3) throws IOException { } @Override public boolean noLinks() { return false; } protected void nodeToHTML(TagNode node, Appendable resultBuffer, IWikiModel model) throws IOException { String name = node.getName(); if (name.equals("a")) { Map<String, String> tagAtttributes = node.getAttributes(); if (tagAtttributes != null) { String href = (String) tagAtttributes.get("href"); String linkClass = (String) tagAtttributes.get("class"); if (href != null && linkClass != null) { if (linkClass.equalsIgnoreCase("externallink")) { // replace wiki escaped & pattern href = href.replaceAll(AMPERSAND_PATTERN,"&"); // find first instance of ref pattern int refIndex = href.indexOf(ANCHOR_REF_PATTERN); if (refIndex != -1) { href = href.substring(0,refIndex); } resultBuffer.append(href + "\n"); } } } } List<Object> children = node.getChildren(); if (children.size() != 0) { if (children.size() != 0) { nodesToText(children, resultBuffer, model); } } } @Override public void nodesToText(List<? extends Object> nodes, Appendable resultBuffer,IWikiModel model) throws IOException { if (nodes != null && !nodes.isEmpty()) { try { int level = model.incrementRecursionLevel(); if (level > 100) { resultBuffer .append("Error - recursion limit exceeded rendering tags in PlainTextConverter#nodesToText()."); return; } Iterator<? extends Object> childrenIt = nodes.iterator(); while (childrenIt.hasNext()) { Object item = childrenIt.next(); if (item != null) { if (item instanceof List) { nodesToText((List) item, resultBuffer, model); } else if (item instanceof TagNode) { TagNode node = (TagNode) item; nodeToHTML(node, resultBuffer, model); } } } } finally { model.decrementRecursionLevel(); } } } } }