package it.sauronsoftware.feed4j.html;

import java.io.StringReader;
import java.util.Locale;

import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;

/**
 * HTML code optimizer. It analyzes an HTML fragment, fixing common mistakes
 * and removing those attributes and tags that are usually useless for a feed
 * reader.
 *
 * @author Carlo Pelliccia
 */
public class HTMLOptimizer {

	/**
	 * An approved attribute: its (lower-case) name and whether the owning tag
	 * is kept only when this attribute is present with a non-empty value.
	 *
	 * @author Carlo Pelliccia
	 */
	private static class Attribute {

		public final String name;

		public final boolean required;

		public Attribute(String name, boolean required) {
			this.name = name;
			this.required = required;
		}

	}

	/**
	 * An approved tag: its (lower-case) name, the attributes that survive
	 * the optimization, and whether it is a void element (one that never has
	 * content and is therefore written in the self-closing form).
	 *
	 * @author Carlo Pelliccia
	 */
	private static class Tag {

		public final String name;

		public final Attribute[] attributes;

		public final boolean voidElement;

		public Tag(String name, Attribute[] attributes) {
			this(name, attributes, false);
		}

		public Tag(String name, Attribute[] attributes, boolean voidElement) {
			this.name = name;
			this.attributes = attributes;
			this.voidElement = voidElement;
		}

	}

	/**
	 * The approved tags list, each element with its approved attributes.
	 * Everything else is removed (the content of a removed tag is kept).
	 */
	private static final Tag[] TAGS = new Tag[] {
			new Tag("strong", new Attribute[0]),
			new Tag("cite", new Attribute[0]),
			new Tag("em", new Attribute[0]),
			new Tag("p", new Attribute[0]),
			new Tag("div", new Attribute[0]),
			new Tag("br", new Attribute[0], true),
			new Tag("ul", new Attribute[0]),
			new Tag("ol", new Attribute[0]),
			new Tag("li", new Attribute[0]),
			new Tag("table", new Attribute[0]),
			new Tag("tr", new Attribute[0]),
			new Tag("td", new Attribute[0]),
			new Tag("th", new Attribute[0]),
			new Tag("img", new Attribute[] {
					new Attribute("src", true),
					new Attribute("width", false),
					new Attribute("height", false),
					new Attribute("alt", false),
					new Attribute("title", false) }, true),
			new Tag("a", new Attribute[] {
					new Attribute("href", true),
					new Attribute("title", false) }) };

	/**
	 * This method analyzes an HTML fragment, fixing common mistakes and
	 * removing those attributes and tags usually useless for a feed reader.
	 *
	 * @param html
	 *            The HTML fragment.
	 * @return The fixed HTML fragment, or <code>null</code> if the given
	 *         fragment is <code>null</code> or cannot be parsed.
	 */
	public static String optimize(String html) {
		if (html == null) {
			// Same outcome as before (the parser failure was caught and null
			// returned), just without relying on an exception.
			return null;
		}
		// Parse the HTML.
		DOMFragmentParser parser = new DOMFragmentParser();
		HTMLDocument document = new HTMLDocumentImpl();
		DocumentFragment fragment = document.createDocumentFragment();
		try {
			parser.parse(new InputSource(new StringReader(html)), fragment);
		} catch (Exception e) {
			// Callers rely on null as the "could not parse" signal.
			return null;
		}
		// Run the optimizations, re-encode as a string and return.
		return fromNodeToString(fragment).toString();
	}

	/**
	 * Serializes a node: elements go through the tag filter, text nodes are
	 * entity-encoded, and for any other node type only the children are
	 * serialized.
	 *
	 * @param node
	 *            The node to serialize.
	 * @return The serialized markup.
	 */
	private static StringBuffer fromNodeToString(Node node) {
		StringBuffer buffer = new StringBuffer();
		if (node instanceof Element) {
			buffer.append(internal((Element) node));
		} else if (node instanceof Text) {
			buffer.append(HTMLEntities.encode(node.getNodeValue()));
		} else {
			appendChildren(buffer, node);
		}
		return buffer;
	}

	/**
	 * Serializes an element. If the element is an approved tag it is written
	 * with its approved attributes around its serialized children; otherwise
	 * only the serialized children are returned.
	 *
	 * @param el
	 *            The element to serialize.
	 * @return The serialized markup.
	 */
	private static StringBuffer internal(Element el) {
		String tagname = lowerCase(el.getNodeName());
		Tag tag = recognizeTag(tagname, el.getAttributes());

		StringBuffer content = new StringBuffer();
		appendChildren(content, el);
		if (tag == null) {
			// Unapproved tag: drop the markup, keep whatever it contained.
			return content;
		}

		StringBuffer buffer = new StringBuffer();
		buffer.append('<');
		buffer.append(tagname);
		buffer.append(recognizeAttributes(el, tag));
		if (content.length() == 0 && tag.voidElement) {
			buffer.append(" />");
		} else {
			// Non-void elements always get an explicit end tag, even when
			// empty: HTML parsers ignore the "/" in e.g. "<strong />" and
			// would treat it as an unclosed start tag.
			buffer.append('>');
			buffer.append(content);
			buffer.append("</");
			buffer.append(tagname);
			buffer.append('>');
		}
		return buffer;
	}

	/**
	 * Appends the serialized form of every child of the given node.
	 *
	 * @param buffer
	 *            The buffer receiving the markup.
	 * @param parent
	 *            The node whose children are serialized.
	 */
	private static void appendChildren(StringBuffer buffer, Node parent) {
		NodeList children = parent.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			buffer.append(fromNodeToString(children.item(i)));
		}
	}

	/**
	 * Looks up the approved tag with the given name and checks that all of
	 * its required attributes are present with a non-empty value.
	 *
	 * @param tagName
	 *            The lower-case tag name.
	 * @param attrs
	 *            The attributes found on the element.
	 * @return The approved tag, or <code>null</code> if the name is not
	 *         approved or a required attribute is missing or empty.
	 */
	private static Tag recognizeTag(String tagName, NamedNodeMap attrs) {
		for (int i = 0; i < TAGS.length; i++) {
			if (TAGS[i].name.equals(tagName)) {
				Attribute[] approved = TAGS[i].attributes;
				for (int j = 0; j < approved.length; j++) {
					if (approved[j].required
							&& !hasNonEmptyAttribute(attrs, approved[j].name)) {
						return null;
					}
				}
				return TAGS[i];
			}
		}
		return null;
	}

	/**
	 * Tells whether the attribute map holds an attribute with the given name
	 * (compared ignoring case) and a non-empty value.
	 */
	private static boolean hasNonEmptyAttribute(NamedNodeMap attrs, String name) {
		for (int k = 0; k < attrs.getLength(); k++) {
			Node attr = attrs.item(k);
			String value = attr.getNodeValue();
			if (name.equalsIgnoreCase(attr.getNodeName()) && value != null
					&& value.length() > 0) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Serializes the attributes of an element, keeping only those approved
	 * for the given tag that have a non-empty value. Each kept attribute is
	 * written as <code> name="value"</code> with the value entity-encoded.
	 *
	 * NOTE(review): values are only entity-encoded here; URL schemes in
	 * <code>href</code>/<code>src</code> (e.g. <code>javascript:</code>) are
	 * not filtered by this class.
	 *
	 * @param element
	 *            The element whose attributes are serialized.
	 * @param tag
	 *            The approved tag describing which attributes to keep.
	 * @return The serialized attributes (possibly empty).
	 */
	private static StringBuffer recognizeAttributes(Element element, Tag tag) {
		StringBuffer buffer = new StringBuffer();
		NamedNodeMap attrs = element.getAttributes();
		for (int k = 0; k < attrs.getLength(); k++) {
			Node attr = attrs.item(k);
			String attrName = lowerCase(attr.getNodeName());
			String attrValue = attr.getNodeValue();
			if (attrValue == null || attrValue.length() == 0) {
				continue;
			}
			if (isApprovedAttribute(tag, attrName)) {
				buffer.append(' ');
				buffer.append(attrName);
				buffer.append("=\"");
				buffer.append(HTMLEntities.encode(attrValue));
				buffer.append('"');
			}
		}
		return buffer;
	}

	/**
	 * Tells whether the given lower-case attribute name is approved for the
	 * given tag.
	 */
	private static boolean isApprovedAttribute(Tag tag, String attrName) {
		for (int w = 0; w < tag.attributes.length; w++) {
			if (attrName.equals(tag.attributes[w].name)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Lower-cases a tag or attribute name with a fixed locale, so matching
	 * against the approved names does not depend on the JVM default locale
	 * (under a Turkish default locale "IMG".toLowerCase() does not yield
	 * "img").
	 */
	private static String lowerCase(String name) {
		return name.toLowerCase(Locale.ENGLISH);
	}

}