package it.sauronsoftware.feed4j.html;
import it.sauronsoftware.feed4j.bean.RawAttribute;
import it.sauronsoftware.feed4j.bean.RawElement;
import it.sauronsoftware.feed4j.bean.RawNode;
import it.sauronsoftware.feed4j.bean.RawText;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;
/**
* HTML fragments parser and generator.
*
* @author Carlo Pelliccia
*/
public class HTMLFragmentHelper {
/**
* XML namespace.
*/
private static final String XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace";
/**
* XHTML namespace.
*/
private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
/**
* This method extracts a plain text from a HTML fragment.
*
* @param str
* The HTML fragment.
* @return The plain text extracted from the fragment.
*/
public static String fromHTMLtoTextPlain(String str) {
DOMFragmentParser parser = new DOMFragmentParser();
HTMLDocument document = new HTMLDocumentImpl();
DocumentFragment fragment = document.createDocumentFragment();
try {
parser.parse(new InputSource(new StringReader(str)), fragment);
} catch (Exception e) {
return null;
}
return nodeToText(fragment);
}
private static String nodeToText(Node node) {
StringBuffer buffer = new StringBuffer();
if (node instanceof Text) {
Text text = (Text) node;
buffer.append(text.getData());
buffer.append(' ');
} else {
NodeList list = node.getChildNodes();
for (int i = 0; i < list.getLength(); i++) {
buffer.append(nodeToText(list.item(i)));
buffer.append(' ');
}
}
String ret = buffer.toString();
return ret.replaceAll("\\s+", " ").trim();
}
/**
* This method extracts a plain text from a XHTML fragment.
*
* @param element
* The XHTML fragment as an XML raw element.
* @return The plain text extracted from the fragment.
*/
public static String fromXHTMLtoTextPlain(RawElement element) {
StringBuffer buffer = new StringBuffer();
for (int i = 0; i < element.getNodeCount(); i++) {
RawNode node = element.getNode(i);
if (node instanceof RawElement) {
RawElement sub = (RawElement) node;
buffer.append(fromXHTMLtoTextPlain(sub));
} else if (node instanceof RawText) {
RawText sub = (RawText) node;
buffer.append(sub.getText());
}
}
String ret = buffer.toString();
return ret.replaceAll("\\s+", " ").trim();
}
/**
* This method encodes a plain text fragment in a HTML one.
*
* @param str
* The plain text fragment.
* @return The encoded HTML fragment.
*/
public static String fromTextPlainToHTML(String str) {
return HTMLEntities.encode(str);
}
/**
* This method changes a XHTML fragment in a HTML one.
*
* @param element
* The XHTML fragment as an XML raw element.
* @return The HTML fragment as a string.
*/
public static String fromXHTMLtoHTML(RawElement element) {
return fromXHTMLtoHTML(element, null);
}
/**
* This method changes a XHTML fragment in a HTML one.
*
* @param element
* The XHTML fragment as an XML raw element.
* @param base
* The base URL for link href and image src absolute
* reconstruction.
* @return The HTML fragment as a string.
*/
private static String fromXHTMLtoHTML(RawElement element, URL base) {
String aux = element.getAttributeValue(XML_NAMESPACE, "base");
if (aux != null) {
try {
base = new URL(aux);
} catch (MalformedURLException e) {
;
}
}
StringBuffer buffer = new StringBuffer();
for (int i = 0; i < element.getNodeCount(); i++) {
RawNode node = element.getNode(i);
if (node instanceof RawText) {
RawText rawText = (RawText) node;
buffer.append(rawText.getText());
} else if (node instanceof RawElement) {
RawElement rawElement = (RawElement) node;
if (rawElement.getNamespaceURI().equals(XHTML_NAMESPACE)) {
buffer.append('<');
buffer.append(rawElement.getName());
for (int j = 0; j < rawElement.getAttributeCount(); j++) {
RawAttribute rawAttribute = rawElement.getAttribute(j);
if (rawAttribute.getNamespaceURI().equals(
XHTML_NAMESPACE)) {
String attrname = rawAttribute.getName();
String attrvalue = rawAttribute.getValue();
attrvalue = applBase(attrname, attrvalue, base);
buffer.append(' ');
buffer.append(attrname);
buffer.append('=');
buffer.append('"');
buffer.append(HTMLEntities.encode(attrvalue));
buffer.append('"');
}
}
String value = rawElement.getValue();
if (value != null) {
buffer.append('>');
buffer.append(HTMLEntities.encode(value));
buffer.append('<');
buffer.append('/');
buffer.append(rawElement.getName());
buffer.append('>');
} else if (rawElement.getNodeCount() > 0) {
buffer.append('>');
buffer.append(fromXHTMLtoHTML(rawElement, base));
buffer.append('<');
buffer.append('/');
buffer.append(rawElement.getName());
buffer.append('>');
} else {
buffer.append(' ');
buffer.append('/');
buffer.append('>');
}
}
}
}
String ret = buffer.toString();
return ret.replaceAll("\\s+", " ").trim();
}
private static String applBase(String name, String value, URL base) {
if (base != null && (name.equals("href") || name.equals("src"))) {
if (value.indexOf(':') == -1) {
try {
URL aux = new URL(base, value);
value = aux.toExternalForm();
} catch (MalformedURLException e) {
;
}
}
}
return value;
}
}