/*
 *    WikiPreprocessor.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.wikipedia.miner.annotation.preprocessing;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;
import java.util.regex.*;

import org.wikipedia.miner.annotation.preprocessing.PreprocessedDocument.RegionTag;
import org.wikipedia.miner.model.*;

/**
 * This class prepares documents in MediaWiki markup format so that they can be tagged by a document tagger.
 * <p>
 * Markup constructs are overwritten with spaces rather than deleted, so every character that survives
 * keeps the offset it had in the original content.
 *
 * @author David Milne
 */
public class WikiPreprocessor extends DocumentPreprocessor {

    /** Section header: two or more '=', a title, then the same run of '=' again. */
    private static final Pattern SECTION_HEADER = Pattern.compile("(={2,})([^=]+)\\1");

    /** Opening or closing template delimiter. */
    private static final Pattern TEMPLATE_DELIMITERS = Pattern.compile("(\\{\\{|\\}\\})");

    /** Opening or closing table delimiter. */
    private static final Pattern TABLE_DELIMITERS = Pattern.compile("(\\{\\||\\|\\})");

    /** Opening or closing internal-link delimiter. */
    private static final Pattern LINK_DELIMITERS = Pattern.compile("(\\[\\[|\\]\\])");

    /** Every delimiter matched by the patterns above is exactly this many characters long. */
    private static final int DELIMITER_LENGTH = 2;

    private final Wikipedia wikipedia;

    /**
     * Initializes a new WikiPreprocessor. This will treat all section headers (==header==) as separate regions, and
     * will ban all topics that have already been linked to in the markup.
     *
     * @param wikipedia an instance of wikipedia
     */
    public WikiPreprocessor(Wikipedia wikipedia) {
        super(null, null, Pattern.compile("={2,}([^=]+)={2,}"));
        this.wikipedia = wikipedia;
    }

    /**
     * Blanks out the wiki markup in the given content and collects context text (link anchors and
     * section titles) and banned topics (articles the markup already links to) along the way.
     *
     * @param content the raw MediaWiki markup
     * @return a document built from the original content, the blanked text, the collected context,
     *         the region tags reported by {@code getRegionTags} and the banned topic ids
     */
    @Override
    public PreprocessedDocument preprocess(String content) {

        StringBuffer context = new StringBuffer();
        ArrayList<RegionTag> regionTags = getRegionTags(content);
        HashSet<Integer> bannedTopics = new HashSet<Integer>();

        String temp = blankTemplates(content);
        temp = blankTables(temp);
        temp = blankLinks(temp, context, bannedTopics);
        temp = blankSectionHeaders(temp, context);

        // NOTE(review): clearAllMentions is inherited and not visible here; presumably it overwrites
        // every match of the regex with whitespace - confirm against DocumentPreprocessor.
        temp = clearAllMentions("(?s)\\<\\!\\-\\-(.*?)\\-\\-\\>", temp);  //strip comments

        // Self-closing ref tags must go first: otherwise <ref name="x"/> is taken by the attribute
        // pattern below as the opening half of a pair, swallowing body text up to the next </ref>.
        temp = clearAllMentions("<ref(?:\\s[^>]*)?/>", temp);              //remove simple (self-closing) ref tags
        temp = clearAllMentions("(?s)<ref>(.*?)</ref>", temp);             //remove ref tags and all content between them.
        temp = clearAllMentions("(?s)<ref\\s(.*?)>(.*?)</ref>", temp);     //remove ref tags and all content between them (with attributes).

        temp = clearAllMentions("<(.*?)>", temp);                          //remove remaining html tags
        temp = clearAllMentions("\\[(http|www)(.*?)\\]", temp);            //remove external links
        temp = clearAllMentions("'{2,}", temp);                            //remove all bold and italic markup
        temp = clearAllMentionsRetainFirstCharacter("\n:+", temp);         //remove indents.
        temp = clearAllMentionsRetainFirstCharacter("\n([\\*\\#]+)", temp); //remove list markers.
        temp = clearAllMentions("&\\w{2,6};", temp);                       //remove entities

        return new PreprocessedDocument(content, temp, context.toString(), regionTags, bannedTopics);
    }

    /**
     * Replaces every section header with a run of spaces of the same length, and adds the header
     * titles to the context, skipping the boilerplate sections listed in {@link #isBoilerplateSection}.
     *
     * @param markup the markup to process
     * @param context receives a newline followed by the title for each retained header
     * @return the markup with all section headers blanked
     */
    private String blankSectionHeaders(String markup, StringBuffer context) {

        Matcher m = SECTION_HEADER.matcher(markup);
        StringBuilder sb = new StringBuilder();
        int lastPos = 0;

        while (m.find()) {
            sb.append(markup, lastPos, m.start());
            sb.append(getSpaceString(m.group().length()));

            String title = m.group(2).trim();
            if (!isBoilerplateSection(title)) {
                context.append("\n").append(title);
            }
            lastPos = m.end();
        }

        sb.append(markup, lastPos, markup.length());
        return sb.toString();
    }

    /**
     * Tells whether a section title is one of the fixed titles that are kept out of the context.
     * The comparison ignores case.
     */
    private static boolean isBoilerplateSection(String title) {
        return title.equalsIgnoreCase("see also")
                || title.equalsIgnoreCase("external links")
                || title.equalsIgnoreCase("references")
                || title.equalsIgnoreCase("further reading");
    }

    /**
     * Overwrites every outermost {{template}}, including anything nested inside it, with spaces.
     *
     * @param markup the markup to process
     * @return the markup with templates blanked
     */
    private String blankTemplates(String markup) {
        List<int[]> spans = outermostSpans(markup, TEMPLATE_DELIMITERS, "{{",
                "WikiPreprocessor | Warning: templates were not well formed, so we cannot guarantee that they were stripped out correctly. ");
        return blankSpans(markup, spans);
    }

    /**
     * Overwrites every outermost {| table |}, including anything nested inside it, with spaces.
     *
     * @param markup the markup to process
     * @return the markup with tables blanked
     */
    private String blankTables(String markup) {
        List<int[]> spans = outermostSpans(markup, TABLE_DELIMITERS, "{|",
                "WikiPreprocessor | Warning: tables were not well formed, so we cannot guarantee that they were stripped out correctly. ");
        return blankSpans(markup, spans);
    }

    /**
     * Overwrites every outermost [[link]] with spaces, after passing the text between its
     * delimiters to {@link #processLink}.
     *
     * @param markup the markup to process
     * @param context receives the anchor text of each processed link
     * @param bannedTopics receives the id of each article that a processed link resolves to
     * @return the markup with links blanked
     */
    private String blankLinks(String markup, StringBuffer context, HashSet<Integer> bannedTopics) {
        List<int[]> spans = outermostSpans(markup, LINK_DELIMITERS, "[[",
                "WikiPreprocessor| Warning: links were not well formed, so we cannot guarantee that they were stripped out correctly. ");

        for (int[] span : spans) {
            //we have the whole link, possibly with other links nested inside.
            String linkBody = markup.substring(span[0] + DELIMITER_LENGTH, span[1] - DELIMITER_LENGTH);
            processLink(linkBody, context, bannedTopics);
        }
        return blankSpans(markup, spans);
    }

    /**
     * Scans the markup for balanced opener/closer pairs and reports the outermost ones.
     * <p>
     * Positions of unmatched openers are kept on a stack; a span is reported only when a closer
     * empties that stack, so nested constructs are folded into their outermost parent. A closer
     * seen while the stack is empty is ignored. If openers are left over at the end of the text,
     * the given warning is printed to standard error.
     *
     * @param markup the text to scan
     * @param delimiters a pattern matching both the opening and the closing delimiter
     * @param opener the literal opening delimiter; any other match is treated as a closer
     * @param malformedWarning the message to print when openers remain unclosed
     * @return the spans as {start, end} character offsets (end exclusive), in ascending order
     */
    private static List<int[]> outermostSpans(String markup, Pattern delimiters, String opener, String malformedWarning) {

        List<Integer> openPositions = new ArrayList<Integer>();
        List<int[]> spans = new ArrayList<int[]>();

        Matcher m = delimiters.matcher(markup);
        while (m.find()) {
            if (m.group().equals(opener)) {
                openPositions.add(m.start());
            } else if (!openPositions.isEmpty()) {
                // Pop the character offset that was pushed for the matching opener. The span must
                // start at that offset, not at the stack index it was stored under.
                int start = openPositions.remove(openPositions.size() - 1);
                if (openPositions.isEmpty()) {
                    spans.add(new int[] {start, m.end()});
                }
            }
        }

        if (!openPositions.isEmpty()) {
            System.err.println(malformedWarning);
        }
        return spans;
    }

    /**
     * Returns a copy of the markup in which every character inside one of the given spans is
     * replaced by a space; the result has the same length as the input.
     *
     * @param markup the text to copy
     * @param spans non-overlapping {start, end} offsets (end exclusive) in ascending order
     * @return the blanked copy
     */
    private static String blankSpans(String markup, List<int[]> spans) {

        StringBuilder sb = new StringBuilder(markup.length());
        int lastIndex = 0;

        for (int[] span : spans) {
            sb.append(markup, lastIndex, span[0]);
            for (int i = span[0]; i < span[1]; i++) {
                sb.append(' ');
            }
            lastIndex = span[1];
        }

        sb.append(markup, lastIndex, markup.length());
        return sb.toString();
    }

    /**
     * Overwrites each match of the regex with spaces, except for the first character of the match,
     * which is kept. The regex is compiled case-insensitively and in DOTALL mode.
     *
     * @param regex the pattern to look for; it is expected never to match the empty string
     * @param text the text to process
     * @return the processed text, the same length as the input
     */
    private String clearAllMentionsRetainFirstCharacter(String regex, String text) {

        Pattern p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher m = p.matcher(text);

        StringBuilder sb = new StringBuilder();
        int lastPos = 0;

        while (m.find()) {
            sb.append(text, lastPos, m.start());
            sb.append(text.charAt(m.start()));
            for (int i = 1; i < m.group().length(); i++) {
                sb.append(' ');
            }
            lastPos = m.end();
        }

        sb.append(text, lastPos, text.length());
        return sb.toString();
    }

    /**
     * Records the anchor text of a link in the context and, if the link destination resolves to
     * an article, bans that article's id.
     * <p>
     * The text after the last '|' is the anchor and the text before it the destination; without
     * a '|' the whole markup serves as both.
     *
     * @param markup the text between the link delimiters
     * @param context receives a newline followed by the anchor text
     * @param bannedTopics receives the id of the destination article, if one is found
     */
    private void processLink(String markup, StringBuffer context, HashSet<Integer> bannedTopics) {

        //ignore everything that is not in main namespace
        if (markup.indexOf(":") > 0) {
            return;
        }

        String anchor = markup;
        String dest = markup;

        int pos = markup.lastIndexOf("|");
        if (pos > 0) {
            anchor = markup.substring(pos + 1);
            dest = markup.substring(0, pos);
        }

        context.append("\n").append(anchor);

        Article art = wikipedia.getArticleByTitle(dest);
        if (art != null) {
            bannedTopics.add(art.getId());
        }
    }
}