/*
This file is part of Cyclos (www.cyclos.org).
A project of the Social Trade Organisation (www.socialtrade.org).
Cyclos is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Cyclos is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Cyclos; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package nl.strohalm.cyclos.utils.conversion;
import java.io.StringReader;
import nl.strohalm.cyclos.utils.XmlHelper;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
/**
 * Converter for HTML formatted values. On input ({@link #valueOf(String)}) the markup is parsed with JTidy, every
 * element whose tag is listed in {@link #BAD_TAGS} is removed, every attribute whose name starts with <code>on</code>
 * (event handlers) is removed, and the inner markup of the resulting <code>body</code> element is returned. On output
 * ({@link #toString(String)}) the value is returned untouched.
 * <p>
 * Two shared instances exist, selected by the <code>removeBlankspaces</code> flag: when the flag is set, leading and
 * trailing non-breaking spaces are stripped from the result and blank input yields <code>null</code>.
 * <p>
 * NOTE(review): only tags and <code>on*</code> attributes are filtered here; script URLs inside attribute values
 * (for example <code>href="javascript:..."</code>) are not handled by this class.
 * @author luis
 */
public class HtmlConverter implements Converter<String> {

    private static final long          serialVersionUID = -1184040713929519035L;

    /** Tags which are removed, together with their whole content, from the parsed document */
    private static final String[]      BAD_TAGS         = { "script", "style", "iframe", "form" };

    /** Code point of the non-breaking space character */
    private static final int           NBSP             = 160;

    /** The non-breaking space as a set of characters to strip */
    private static final String        NBSP_CHARS       = String.valueOf((char) NBSP);

    private static final String        BODY_OPEN_PREFIX = "<body";
    private static final String        BODY_CLOSE       = "</body>";

    // Both instances are stateless apart from their flag and cheap to build, so they are created eagerly.
    // This avoids the unsynchronized lazy initialization which could race between threads.
    private static final HtmlConverter INSTANCE         = new HtmlConverter(true);
    private static final HtmlConverter INSTANCE_NBSP    = new HtmlConverter(false);

    /**
     * Returns the shared converter which removes surrounding non-breaking spaces (same as <code>instance(true)</code>)
     */
    public static HtmlConverter instance() {
        return instance(true);
    }

    /**
     * Returns the shared converter for the given mode
     * @param removeBlankspaces When true, the returned converter strips leading / trailing non-breaking spaces and
     * converts blank input into null. When false, blank input is returned as is and non-breaking spaces are preserved
     */
    public static HtmlConverter instance(final boolean removeBlankspaces) {
        return removeBlankspaces ? INSTANCE : INSTANCE_NBSP;
    }

    /**
     * Extracts the markup between the opening and the closing body tags of a serialized body element. The opening tag
     * may carry attributes, and an empty element may be serialized in the self-closed form.
     * @return The inner markup, an empty string when the body has no content, or null when no body tag is found
     */
    private static String extractBodyContent(final String xml) {
        if (xml == null) {
            return null;
        }
        final int open = xml.indexOf(BODY_OPEN_PREFIX);
        if (open < 0) {
            return null;
        }
        // Locate the '>' which closes the opening tag, skipping over any '>' inside quoted attribute values
        int tagEnd = -1;
        char quote = 0;
        for (int i = open + BODY_OPEN_PREFIX.length(); i < xml.length(); i++) {
            final char c = xml.charAt(i);
            if (quote != 0) {
                if (c == quote) {
                    quote = 0;
                }
            } else if (c == '"' || c == '\'') {
                quote = c;
            } else if (c == '>') {
                tagEnd = i;
                break;
            }
        }
        if (tagEnd < 0) {
            return null;
        }
        if (xml.charAt(tagEnd - 1) == '/') {
            // Self-closed element (<body/>): there is no content
            return "";
        }
        final int close = xml.lastIndexOf(BODY_CLOSE);
        if (close <= tagEnd) {
            return "";
        }
        return xml.substring(tagEnd + 1, close);
    }

    /**
     * Removes, from the given document, every element listed in {@link #BAD_TAGS} and every attribute whose name
     * starts with <code>on</code>
     */
    private static void removeBadNodes(final Document document) {
        final NodeList elements = document.getElementsByTagName("*");
        // The list may be live: removing a node re-indexes whatever follows it. Walking backwards guarantees that
        // a removal only affects positions which were already visited, so no element is skipped.
        for (int i = elements.getLength() - 1; i >= 0; i--) {
            final Element element = (Element) elements.item(i);
            if (element == null) {
                continue;
            }
            if (ArrayUtils.contains(BAD_TAGS, element.getTagName())) {
                if (element.getParentNode() != null) {
                    element.getParentNode().removeChild(element);
                }
                // The element is gone, there is no point in cleaning its attributes
                continue;
            }
            final NamedNodeMap attributes = element.getAttributes();
            if (attributes == null) {
                continue;
            }
            // Same reasoning as above: iterate backwards, as removing an attribute shifts the following ones
            for (int j = attributes.getLength() - 1; j >= 0; j--) {
                final Attr attr = (Attr) attributes.item(j);
                if (attr != null && attr.getNodeName().startsWith("on")) {
                    // This is an event handler: remove it
                    element.removeAttributeNode(attr);
                }
            }
        }
    }

    private final boolean removeBlankspaces;

    private HtmlConverter(final boolean removeBlankspaces) {
        this.removeBlankspaces = removeBlankspaces;
    }

    /**
     * Returns the given string unchanged
     */
    public String toString(final String string) {
        return string;
    }

    /**
     * Sanitizes the given HTML fragment
     * @param string The raw HTML
     * @return The inner markup of the sanitized body element, trimmed, or null when there is nothing left. For blank
     * input, null is returned when this converter removes blank spaces, otherwise the input itself is returned
     */
    public String valueOf(final String string) {
        if (StringUtils.isBlank(string)) {
            return removeBlankspaces ? null : string;
        }

        final Tidy tidy = new Tidy(); // obtain a new Tidy instance
        tidy.setXHTML(false); // set desired config options using tidy setters
        tidy.setQuiet(true);
        tidy.setShowErrors(0);
        tidy.setShowWarnings(false);
        tidy.setIndentContent(false);
        tidy.setXmlOut(true);
        final Document document = tidy.parseDOM(new StringReader(string), null);
        removeBadNodes(document);

        final NodeList bodies = document.getElementsByTagName("body");
        if (bodies.getLength() == 0) {
            // No body element? return null
            return null;
        }

        // The serialized form contains the xml header plus the body element itself. We need the body content only
        String result = extractBodyContent(XmlHelper.toString(bodies.item(0)));
        if (result == null) {
            return null;
        }
        if (removeBlankspaces) {
            // Drop the leading / trailing nbsps. An empty or nbsp-only content ends up as an empty string here,
            // which is then converted to null below
            result = StringUtils.strip(result, NBSP_CHARS);
        }
        return StringUtils.trimToNull(result);
    }
}