/* * HtmlPreprocessor.java * Copyright (C) 2007 David Milne, d.n.milne@gmail.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.wikipedia.miner.annotation.preprocessing; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.wikipedia.miner.annotation.preprocessing.PreprocessedDocument.RegionTag; /** * This class prepares html documents so that they can be tagged by a document tagger. * * @author David Milne */ public class HtmlPreprocessor extends DocumentPreprocessor{ static String[] defaultRegionTags = {"div", "table"} ; static String[] defaultSplitterTags = {"h1", "h2"} ; /** * Creates an preprocessor for html documents, with the default region tags. It will only treat divs and tables as regions, and * will split regions on h1 and h2 tags. It will not ban any topics. */ public HtmlPreprocessor() { super(getStartTagRegex(defaultRegionTags), getEndTagRegex(defaultRegionTags), getTagRegex(defaultSplitterTags)) ; } /** * Creates an preprocessor for html documents, that will recognize the given tags. When specifying tags, just give the name. eg "div", not "<div>" ; * * @param regionTags specifies which tags to treat as regions. e.g. "div" and "table" * @param splitterTags specifies which tags to treat as region splitters. e.g. "h1" and "h2" */ public HtmlPreprocessor(String[] regionTags, String[] splitterTags) { super(getStartTagRegex(regionTags), getEndTagRegex(regionTags), getTagRegex(splitterTags)) ; } /** * Takes the marked-up content and replaces all tags with blank spaces. * Everything before the body tag is also replaced with blanks. * * @param content the html to be preprocessed * @return the preprocessedString */ @Override public PreprocessedDocument preprocess(final String content) { StringBuilder context = new StringBuilder() ; String temp = content.toLowerCase() ; ArrayList<RegionTag> regionTags = getRegionTags(temp) ; //find point where body starts int bodyStart = temp.indexOf("<body") ; if (bodyStart < 0) bodyStart = 0 ; //System.out.println(temp.substring(0, bodyStart)) ; //add title to context Pattern p = Pattern.compile("<title([^>]*)>(.*?)</title>", Pattern.DOTALL) ; Matcher m = p.matcher(temp.substring(0, bodyStart)) ; while (m.find()) { context.append(m.group(2)) ; context.append(".\n") ; } //add metadata to context p = Pattern.compile("<meta(.*?)/>", Pattern.DOTALL) ; m = p.matcher(temp.substring(0, bodyStart)) ; while (m.find()) { String tag = m.group() ; String tName = getAttributeValue(tag, "name") ; String tContent = getAttributeValue(tag, "content") ; if ((tName.equals("keywords") || tName.equals("description")) && !tContent.equals("")) { context.append(tContent) ; context.append("\n") ; } } temp = content.substring(bodyStart) ; // process links, adding anchors to context p = Pattern.compile("<a([^>]*)>(.*?)</a>", Pattern.DOTALL) ; m = p.matcher(temp) ; int lastPos = 0 ; StringBuilder sbTemp = new StringBuilder() ; while(m.find()) { sbTemp.append(temp.substring(lastPos, m.start())) ; sbTemp.append(getSpaceString(m.group().length())) ; //links may contain other tags, lets get down to the raw text. String linkContent = clearAllMentions("<(.*?)>", m.group(2)).trim() ; if (!linkContent.equals("")) { context.append(linkContent) ; context.append(".\n") ; } lastPos = m.end() ; } sbTemp.append(temp.substring(lastPos)) ; temp = sbTemp.toString() ; // process comments temp = clearAllMentions("<!--(.*?)-->", temp) ; // process scripts temp = clearAllMentions("<script(.*?)</script>", temp) ; // process remaining tags temp = clearAllMentions("<(.*?)>", temp) ; // process entities temp = clearAllMentions("&\\w{2,6};", temp) ; return new PreprocessedDocument(content, getSpaceString(bodyStart)+temp, context.toString(), regionTags, null) ; } private String getAttributeValue(String tag, String attributeName) { Pattern p = Pattern.compile(attributeName + "\\W*=\\W*\"(.*?)\"", Pattern.DOTALL) ; Matcher m = p.matcher(tag) ; if (m.find()) return m.group(1) ; else return "" ; } /** * Convenience method for generating patterns that will match all opening tags of the given types * * @param tags the types of tags of interest * @return a regular expression that will match all opening tags of the given types */ public static Pattern getStartTagRegex(String[] tags) { if (tags == null || tags.length == 0) return null ; if (tags.length == 1) return Pattern.compile("<" + tags[0] + "[^>]*>", Pattern.CASE_INSENSITIVE) ; StringBuilder regex = new StringBuilder() ; for (String tag:tags) regex.append(tag).append("|") ; regex.deleteCharAt(regex.length()-1) ; return Pattern.compile("<(" + regex.toString() + ")[^>]*>", Pattern.CASE_INSENSITIVE) ; } /** * Convenience method for generating patterns that will match all closing tags of the given types * * @param tags the types of tags of interest * @return a regular expression that will match all closing tags of the given types */ public static Pattern getEndTagRegex(String[] tags) { if (tags == null || tags.length == 0) return null ; if (tags.length == 1) return Pattern.compile("</" + tags[0] + "[^>]*>", Pattern.CASE_INSENSITIVE) ; StringBuilder regex = new StringBuilder() ; for (String tag:tags) regex.append(tag).append("|") ; regex.deleteCharAt(regex.length()-1) ; return Pattern.compile("</(" + regex.toString() + ")[^>]*>", Pattern.CASE_INSENSITIVE) ; } /** * Convenience method for generating patterns that will match all tags (opening, closing, singular) of the given types * * @param tags the types of tags of interest * @return a regular expression that will match all tags of the given types */ public static Pattern getTagRegex(String[] tags) { if (tags == null || tags.length == 0) return null ; if (tags.length == 1) return Pattern.compile("</*" + tags[0] + "[^>]*>", Pattern.CASE_INSENSITIVE) ; StringBuilder regex = new StringBuilder() ; for (String tag:tags) regex.append(tag).append("|") ; regex.deleteCharAt(regex.length()-1) ; return Pattern.compile("</*(" + regex.toString() + ")[^>]*>", Pattern.CASE_INSENSITIVE) ; } }