/** * Copyright (c) 2009-2014 Câmara dos Deputados. Todos os direitos reservados. * * e-Democracia é um software livre; você pode redistribuí-lo e/ou modificá-lo dentro * dos termos da Licença Pública Geral Menor GNU como publicada pela Fundação do * Software Livre (FSF); na versão 2.1 da Licença, ou (na sua opinião) qualquer versão. * * Este programa é distribuído na esperança de que possa ser útil, mas SEM NENHUMA GARANTIA; * sem uma garantia implícita de ADEQUAÇÃO a qualquer MERCADO ou APLICAÇÃO EM PARTICULAR. * Veja a Licença Pública Geral Menor GNU para maiores detalhes. */ package br.gov.camara.edemocracia.portlets.priorizacao.ui.util; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import net.htmlparser.jericho.Attribute; import net.htmlparser.jericho.CharacterReference; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.EndTagType; import net.htmlparser.jericho.HTMLElementName; import net.htmlparser.jericho.HTMLElements; import net.htmlparser.jericho.OutputDocument; import net.htmlparser.jericho.Segment; import net.htmlparser.jericho.Source; import net.htmlparser.jericho.StartTag; import net.htmlparser.jericho.StartTagType; import net.htmlparser.jericho.Tag; /** * @author robson * */ public class HtmlStripper { private static final Set<String> ALLOWED_TAGS = new HashSet<String>( Arrays.asList(HTMLElementName.BR, HTMLElementName.P,HTMLElementName.EM, HTMLElementName.STRONG, HTMLElementName.I, HTMLElementName.U, HTMLElementName.SPAN, HTMLElementName.OL, HTMLElementName.UL, HTMLElementName.LI,HTMLElementName.A)); private static final Set<String> ALLOWED_ATTRIBUTES = new HashSet<String>( Arrays.asList("alt", "href", "target")); private final Set<String> allowedTags; private final Set<String> allowedAttributes; public HtmlStripper() { allowedTags = new HashSet<String>(ALLOWED_TAGS); allowedAttributes = new HashSet<String>(ALLOWED_ATTRIBUTES); } private static final Object VALID_MARKER = new Object(); /** * Retira tags indesejadas * * @param html * @return */ public String strip(String html) { if (html == null) return ""; Source source = new Source(html); source.fullSequentialParse(); OutputDocument output = new OutputDocument(source); List<Tag> tags = source.getAllTags(); int pos = 0; for (Tag tag : tags) { if (processTag(tag, output)) { tag.setUserData(VALID_MARKER); } else { output.remove(tag); } reencodeTextSegment(source, output, pos, tag.getBegin()); pos = tag.getEnd(); } reencodeTextSegment(source, output, pos, source.getEnd()); return output.toString(); } private void reencodeTextSegment(Source source, OutputDocument output, int begin, int end) { if (begin >= end) return; Segment textSegment = new Segment(source, begin, end); String decodedText = CharacterReference.decode(textSegment); String encodedText = CharacterReference.encode(decodedText); output.replace(textSegment, encodedText); } private boolean processTag(Tag tag, OutputDocument output) { String elementName = tag.getName().toLowerCase(); if (!allowedTags.contains(elementName)) return false; if (tag.getTagType() == StartTagType.NORMAL) { Element element = tag.getElement(); if (HTMLElements.getEndTagRequiredElementNames().contains( elementName)) { if (element.getEndTag() == null) return false; // reject start tag if its required end tag is // missing } else if (HTMLElements.getEndTagOptionalElementNames().contains( elementName)) { if (elementName == HTMLElementName.LI && !isValidLITag(tag)) return false; // reject invalid LI tags if (element.getEndTag() == null) output.insert(element.getEnd(), getEndTagHTML(elementName)); // insert // optional // end // tag // if // it // is // missing } output.replace(tag, getStartTagHTML(element.getStartTag())); } else if (tag.getTagType() == EndTagType.NORMAL) { if (tag.getElement() == null) return false; // reject end tags that aren't associated with a // start tag if (elementName == HTMLElementName.LI && !isValidLITag(tag)) return false; // reject invalid LI tags output.replace(tag, getEndTagHTML(elementName)); } else { return false; // reject abnormal tags } return true; } private CharSequence getStartTagHTML(StartTag startTag) { // tidies and filters out non-approved attributes StringBuilder sb = new StringBuilder(); sb.append('<').append(startTag.getName()); for (Attribute attribute : startTag.getAttributes()) { if (allowedAttributes.contains(attribute.getKey())) { sb.append(' ').append(attribute.getName()); if (attribute.getValue() != null) { sb.append("=\""); sb.append(CharacterReference.encode(attribute.getValue())); sb.append('"'); } } } if (startTag.getElement().getEndTag() == null && !HTMLElements.getEndTagOptionalElementNames().contains( startTag.getName())) sb.append(" /"); sb.append('>'); return sb; } private static String getEndTagHTML(String tagName) { return "</" + tagName + ">"; } private boolean isValidLITag(Tag tag) { Element parentElement = tag.getElement().getParentElement(); if (parentElement == null) return false; // ignore LI elements without a parent if (parentElement.getStartTag().getUserData() != VALID_MARKER) return false; // ignore LI elements who's parent is not valid return parentElement.getName() == HTMLElementName.UL || parentElement.getName() == HTMLElementName.OL; // only accept // LI tags // who's // immediate // parent is // UL or OL. } }