/****************************************************************************** * Copyright (c) 2010 Basis Technology Corp. * * Basis Technology Corp. licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.basistech.readability; import java.util.HashMap; import java.util.Map; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; public class HtmlPage extends XmlDataMap { public static final String KEY = "htmlPage"; static Map<String, ElementAction> elementActionMap; static { elementActionMap = new HashMap<String, ElementAction>(); elementActionMap.put("img", ElementAction.Alt); elementActionMap.put("applet", ElementAction.Alt); elementActionMap.put("area", ElementAction.Alt); elementActionMap.put("input", ElementAction.Alt); elementActionMap.put("script", ElementAction.Banned); elementActionMap.put("iframe", ElementAction.Banned); elementActionMap.put("style", ElementAction.Banned); elementActionMap.put("br", ElementAction.Whitespace); elementActionMap.put("p", ElementAction.Sentence); elementActionMap.put("hr", ElementAction.Sentence); elementActionMap.put("ul", ElementAction.Sentence); elementActionMap.put("h1", ElementAction.Sentence); elementActionMap.put("h2", ElementAction.Sentence); elementActionMap.put("h3", ElementAction.Sentence); elementActionMap.put("h4", ElementAction.Sentence); elementActionMap.put("h5", ElementAction.Sentence); elementActionMap.put("h6", ElementAction.Sentence); elementActionMap.put("pre", ElementAction.Sentence); elementActionMap.put("blockquote", ElementAction.Sentence); elementActionMap.put("title", ElementAction.Sentence); elementActionMap.put("div", ElementAction.Sentence); // hmm, span tags with CSS with certain properties? Hopeless. elementActionMap.put("center", ElementAction.Whitespace); elementActionMap.put("form", ElementAction.Sentence); elementActionMap.put("table", ElementAction.Sentence); elementActionMap.put("td", ElementAction.Sentence); elementActionMap.put("th", ElementAction.Sentence); elementActionMap.put("li", ElementAction.Sentence); elementActionMap.put("dir", ElementAction.Sentence); elementActionMap.put("menu", ElementAction.Sentence); elementActionMap.put("ol", ElementAction.Sentence); } // the data as formatted for RLP -- just the PC-DATA. private String pcData; private String mimeType; public HtmlPage() { super(); } public void process(Document document) { Element body = document.body(); if (body != null) { // page might have no body. process(body); pcData = pcDataBuffer.toString(); } } public String getPcData() { return pcData; } @Override protected ElementAction classifyElement(Element element) { if (element.hasAttr("basisInline")) { return null; } return elementActionMap.get(element.tagName()); } public String getMimeType() { return mimeType; } public void setMimeType(String mimeType) { this.mimeType = mimeType; } }