/* * JBoss, Home of Professional Open Source * Copyright 2013 Red Hat Inc. and/or its affiliates and other contributors * as indicated by the @authors tag. All rights reserved. */ package org.searchisko.preprocessor; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.safety.Whitelist; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; /** * Strips HTML utils. * * @author Lukáš Vlček */ public class HTMLStripUtil { /** * Strip HTML out of input string. * @param html * @return */ public static String stripHTML(String html) { if (html != null) { Document doc = Jsoup.parse(Jsoup.clean(html, Whitelist.relaxed())); return convertNodeToText(doc.body()); } return null; } private static String convertNodeToText(Element element) { if (element == null) return ""; final StringBuilder buffer = new StringBuilder(); new NodeTraversor(new NodeVisitor() { @Override public void head(Node node, int depth) { if (node instanceof TextNode) { TextNode textNode = (TextNode) node; String text = textNode.text().replace('\u00A0', ' ').trim(); // non breaking space if(!text.isEmpty()) { buffer.append(text); if (!text.endsWith(" ")) { buffer.append(" "); // the last text gets appended the extra space too but we remove it later } } } } @Override public void tail(Node node, int depth) {} }).traverse(element); String output = buffer.toString(); if (output.endsWith(" ")) { // removal of the last extra space output = output.substring(0, output.length() - 1); } return output; } }