/**
 * Copyright (c) 2009-2014 Câmara dos Deputados. Todos os direitos reservados.
 *
 * e-Democracia é um software livre; você pode redistribuí-lo e/ou modificá-lo dentro
 * dos termos da Licença Pública Geral Menor GNU como publicada pela Fundação do
 * Software Livre (FSF); na versão 2.1 da Licença, ou (na sua opinião) qualquer versão.
 *
 * Este programa é distribuído na esperança de que possa ser útil, mas SEM NENHUMA GARANTIA;
 * sem uma garantia implícita de ADEQUAÇÃO a qualquer MERCADO ou APLICAÇÃO EM PARTICULAR.
 * Veja a Licença Pública Geral Menor GNU para maiores detalhes.
 */
package br.gov.camara.edemocracia.util;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.EndTagType;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.HTMLElements;
import net.htmlparser.jericho.OutputDocument;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.Tag;

/**
 * Removes every tag and attribute that is not on an allow list from an HTML
 * fragment, and re-encodes the text found between tags.
 *
 * <p>
 * Each instance starts with a copy of the default allow lists and can be
 * customised through the chainable {@code add...}/{@code remove...}/
 * {@code clear...} methods. Tag and attribute names are stored in lower case,
 * so the configuration methods are case-insensitive.
 *
 * <p>
 * NOTE(review): attribute values are only character-reference encoded when
 * they are written back; URL schemes (for example in {@code href}) are not
 * validated by this class, so callers rendering the result may want to filter
 * schemes such as {@code javascript:} themselves.
 *
 * @author robson
 */
public class HtmlStripper {

    /** Tag names accepted by a freshly created instance. */
    private static final Set<String> ALLOWED_TAGS = new HashSet<String>(Arrays.asList(
            HTMLElementName.BR,
            HTMLElementName.P,
            HTMLElementName.EM,
            HTMLElementName.STRONG,
            HTMLElementName.I,
            HTMLElementName.U,
            HTMLElementName.SPAN,
            HTMLElementName.OL,
            HTMLElementName.UL,
            HTMLElementName.LI,
            HTMLElementName.A));

    /** Attribute names accepted by a freshly created instance. */
    private static final Set<String> ALLOWED_ATTRIBUTES =
            new HashSet<String>(Arrays.asList("alt", "href", "target"));

    /** User-data marker stored on every tag that was accepted by {@link #processTag}. */
    private static final Object VALID_MARKER = new Object();

    private final Set<String> allowedTags;

    private final Set<String> allowedAttributes;

    /**
     * Creates a stripper configured with private copies of the default allow
     * lists, so later customisation never affects other instances.
     */
    public HtmlStripper() {
        allowedTags = new HashSet<String>(ALLOWED_TAGS);
        allowedAttributes = new HashSet<String>(ALLOWED_ATTRIBUTES);
    }

    /**
     * Removes unwanted tags from the given HTML.
     *
     * @param html the HTML to clean; may be {@code null}
     * @return the cleaned HTML, or an empty string when {@code html} is {@code null}
     */
    public String strip(String html) {
        if (html == null) {
            return "";
        }

        Source source = new Source(html);
        source.fullSequentialParse();
        OutputDocument output = new OutputDocument(source);
        List<Tag> tags = source.getAllTags();

        // End offset of the previously visited tag; text between this offset
        // and the next tag is re-encoded.
        int pos = 0;
        for (Tag tag : tags) {
            if (processTag(tag, output)) {
                // Remember accepted tags so that LI validation can check its parent.
                tag.setUserData(VALID_MARKER);
            } else {
                output.remove(tag);
            }
            reencodeTextSegment(source, output, pos, tag.getBegin());
            pos = tag.getEnd();
        }
        // Trailing text after the last tag (or the whole input when there are no tags).
        reencodeTextSegment(source, output, pos, source.getEnd());
        return output.toString();
    }

    // The configuration methods below return this object to allow method chaining.

    /**
     * Adds an allowed tag name.
     *
     * @param tagName tag name without &lt; or &gt;; matched case-insensitively
     * @return this, for method chaining
     */
    public HtmlStripper addAllowedTag(String tagName) {
        allowedTags.add(normalizeName(tagName));
        return this;
    }

    /**
     * Adds allowed tag names.
     *
     * @param tagNames tag names without &lt; or &gt;; matched case-insensitively
     * @return this, for method chaining
     */
    public HtmlStripper addAllowedTags(String... tagNames) {
        for (String tagName : tagNames) {
            addAllowedTag(tagName);
        }
        return this;
    }

    /**
     * Removes an allowed tag name.
     *
     * @param tagName tag name without &lt; or &gt;; matched case-insensitively
     * @return this, for method chaining
     */
    public HtmlStripper removeAllowedTag(String tagName) {
        allowedTags.remove(normalizeName(tagName));
        return this;
    }

    /**
     * Adds an allowed attribute name.
     *
     * @param attributeName attribute name without ' or "; matched case-insensitively
     * @return this, for method chaining
     */
    public HtmlStripper addAllowedAttribute(String attributeName) {
        allowedAttributes.add(normalizeName(attributeName));
        return this;
    }

    /**
     * Adds allowed attribute names.
     *
     * @param attributeNames attribute names without ' or "; matched case-insensitively
     * @return this, for method chaining
     */
    public HtmlStripper addAllowedAttributes(String... attributeNames) {
        for (String attributeName : attributeNames) {
            addAllowedAttribute(attributeName);
        }
        return this;
    }

    /**
     * Removes an allowed attribute name.
     *
     * @param attributeName attribute name without ' or "; matched case-insensitively
     * @return this, for method chaining
     */
    public HtmlStripper removeAllowedAttribute(String attributeName) {
        allowedAttributes.remove(normalizeName(attributeName));
        return this;
    }

    /**
     * Empties the tag allow list, so that every tag is stripped until new
     * names are added.
     *
     * @return this, for method chaining
     */
    public HtmlStripper clearAllowedTags() {
        allowedTags.clear();
        return this;
    }

    /**
     * Empties the attribute allow list, so that every attribute is dropped
     * until new names are added.
     *
     * @return this, for method chaining
     */
    public HtmlStripper clearAllowedAttributes() {
        allowedAttributes.clear();
        return this;
    }

    /**
     * Lower-cases a tag or attribute name so that it matches the lower-cased
     * names used for the allow-list lookups.
     *
     * @param name the name to normalise; may be {@code null}
     * @return the lower-cased name, or {@code null} when {@code name} is {@code null}
     */
    private static String normalizeName(String name) {
        return name == null ? null : name.toLowerCase(Locale.ROOT);
    }

    /**
     * Decodes and then re-encodes the character references of the source text
     * in {@code [begin, end)}, registering the result as a replacement in
     * {@code output}. Does nothing for an empty range.
     */
    private void reencodeTextSegment(Source source, OutputDocument output, int begin, int end) {
        if (begin >= end) {
            return;
        }
        Segment textSegment = new Segment(source, begin, end);
        String decodedText = CharacterReference.decode(textSegment);
        String encodedText = CharacterReference.encode(decodedText);
        output.replace(textSegment, encodedText);
    }

    /**
     * Decides whether a tag is kept and, if so, registers its tidied form in
     * {@code output}.
     *
     * @param tag the tag to examine
     * @param output the document receiving the replacement or inserted end tag
     * @return {@code true} when the tag is kept, {@code false} when the caller
     *         must remove it
     */
    private boolean processTag(Tag tag, OutputDocument output) {
        String elementName = tag.getName().toLowerCase(Locale.ROOT);
        if (!allowedTags.contains(elementName)) {
            return false;
        }

        if (tag.getTagType() == StartTagType.NORMAL) {
            Element element = tag.getElement();
            if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) {
                if (element.getEndTag() == null) {
                    // reject start tag if its required end tag is missing
                    return false;
                }
            } else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
                if (HTMLElementName.LI.equals(elementName) && !isValidLITag(tag)) {
                    // reject invalid LI tags
                    return false;
                }
                if (element.getEndTag() == null) {
                    // insert optional end tag if it is missing
                    output.insert(element.getEnd(), getEndTagHTML(elementName));
                }
            }
            output.replace(tag, getStartTagHTML(element.getStartTag()));
            return true;
        }

        if (tag.getTagType() == EndTagType.NORMAL) {
            if (tag.getElement() == null) {
                // reject end tags that aren't associated with a start tag
                return false;
            }
            if (HTMLElementName.LI.equals(elementName) && !isValidLITag(tag)) {
                // reject invalid LI tags
                return false;
            }
            output.replace(tag, getEndTagHTML(elementName));
            return true;
        }

        // reject abnormal tags
        return false;
    }

    /**
     * Builds a tidied start tag that keeps only the allowed attributes, with
     * their values character-reference encoded and wrapped in double quotes.
     * The tag is written as self-closing when its element has no end tag and
     * the end tag is not optional for that element name.
     */
    private CharSequence getStartTagHTML(StartTag startTag) {
        StringBuilder sb = new StringBuilder();
        sb.append('<').append(startTag.getName());
        for (Attribute attribute : startTag.getAttributes()) {
            if (!allowedAttributes.contains(attribute.getKey())) {
                continue;
            }
            sb.append(' ').append(attribute.getName());
            if (attribute.getValue() != null) {
                sb.append("=\"");
                sb.append(CharacterReference.encode(attribute.getValue()));
                sb.append('"');
            }
        }
        if (startTag.getElement().getEndTag() == null
                && !HTMLElements.getEndTagOptionalElementNames().contains(startTag.getName())) {
            sb.append(" /");
        }
        sb.append('>');
        return sb;
    }

    /** Returns the end tag markup for the given tag name. */
    private static String getEndTagHTML(String tagName) {
        return "</" + tagName + ">";
    }

    /**
     * Checks that an LI tag sits directly inside an accepted UL or OL element.
     *
     * @param tag an LI start or end tag that belongs to an element
     * @return {@code true} when the immediate parent is an accepted UL or OL
     */
    private boolean isValidLITag(Tag tag) {
        Element parentElement = tag.getElement().getParentElement();
        if (parentElement == null) {
            // ignore LI elements without a parent
            return false;
        }
        if (parentElement.getStartTag().getUserData() != VALID_MARKER) {
            // ignore LI elements whose parent is not valid
            return false;
        }
        // only accept LI tags whose immediate parent is UL or OL
        String parentName = parentElement.getName();
        return HTMLElementName.UL.equals(parentName) || HTMLElementName.OL.equals(parentName);
    }
}