/**
* Copyright (c) 2009 Juwi MacMillan Group GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.tizzit.util;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jaxen.BaseXPath;
import org.jaxen.dom.DOMXPath;
import org.tizzit.util.tidy.Configuration;
import org.tizzit.util.tidy.Tidy;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
/**
* Helperclass for XML/XPath Functionality.
* <p>Copyright: Copyright JuwiMacMillan Group GmbH (c) 2003</p>
* @author <a href="mailto:s.kulawik@juwimm.com">Sascha-Matthias Kulawik</a>
* @version $Id$
*/
public final class XercesHelper {
private static Log log = LogFactory.getLog(XercesHelper.class);
private static DocumentBuilder docBuilder = null;
private static DocumentBuilderFactory dbf = null;
private static final String ENTITIES_RESOURCE = "/HTMLEntities.res";
private static Hashtable byChar;
private static Hashtable byName;
static {
initialize();
try {
dbf = DocumentBuilderFactory.newInstance();
dbf.setValidating(false);
dbf.setNamespaceAware(false);
//dbf.setExpandEntityReferences(false);
//dbf.setAttribute("http://xml.org/sax/features/validation", "false");
//dbf.setAttribute("http://apache.org/xml/features/nonvalidating/load-external-dtd", "false");
docBuilder = dbf.newDocumentBuilder();
} catch (Exception exe) {
log.error("Fatal Error occured while getting the DocumentBuilderFactory", exe);
}
}
private XercesHelper() {
}
private static void initialize() {
InputStream is = null;
BufferedReader reader = null;
int index;
String name;
String value;
int code;
String line;
// Make sure not to initialize twice.
if (byName != null) return;
try {
byName = new Hashtable();
byChar = new Hashtable();
is = XercesHelper.class.getResourceAsStream(ENTITIES_RESOURCE);
if (is == null)
throw new RuntimeException("SER003 The resource [" + ENTITIES_RESOURCE + "] could not be found.\n"
+ ENTITIES_RESOURCE);
reader = new BufferedReader(new InputStreamReader(is));
line = reader.readLine();
while (line != null) {
if (line.length() == 0 || line.charAt(0) == '#') {
line = reader.readLine();
continue;
}
index = line.indexOf(' ');
if (index > 1) {
name = line.substring(0, index);
++index;
if (index < line.length()) {
value = line.substring(index);
index = value.indexOf(' ');
if (index > 0) value = value.substring(0, index);
code = Integer.parseInt(value);
defineEntity(name, (char) code);
}
}
line = reader.readLine();
}
is.close();
} catch (Exception except) {
throw new RuntimeException("SER003 The resource [" + ENTITIES_RESOURCE + "] could not load: "
+ except.toString() + "\n" + ENTITIES_RESOURCE + "\t" + except.toString());
} finally {
if (is != null) {
try {
is.close();
} catch (Exception except) {
}
}
}
}
private static void defineEntity(String name, char value) {
if (byName.get(name) == null) {
byName.put(name, new Integer(value));
byChar.put(new Integer(value), name);
}
}
public static synchronized Document getNewDocument() {
Document doc = null;
try {
doc = docBuilder.newDocument();
} catch (Exception exe) {
log.error("unknown error occured", exe);
}
return doc;
}
public static String doc2String(Document doc) {
StringWriter stringOut = new StringWriter();
try {
Transformer t = TransformerFactory.newInstance().newTransformer();
t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
t.setOutputProperty(OutputKeys.METHOD, "xml");
t.transform(new DOMSource(doc), new StreamResult(stringOut));
/*OutputFormat format = new OutputFormat(doc, "ISO-8859-1", true);
format.setOmitXMLDeclaration(true);
format.setOmitDocumentType(true);
stringOut = new StringWriter();
XMLSerializer serial = new XMLSerializer(stringOut, format);
serial.asDOMSerializer();
serial.serialize(doc);*/
} catch (Exception exe) {
log.error("unknown error occured", exe);
}
return stringOut.toString();
}
public static String node2Html(Node node) {
Document doc = getNewDocument();
Node newnde = doc.importNode(node, true);
doc.appendChild(newnde);
StringWriter stringOut = new StringWriter();
try {
Transformer t = TransformerFactory.newInstance().newTransformer();
// for "XHTML" serialization, use the output method "xml" and set publicId as shown
/*t.setOutputProperty(OutputKeys.METHOD, "xml");
t.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, "-//W3C//DTD XHTML 1.0 Transitional//EN");
t.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM,
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd");*/
t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
t.setOutputProperty(OutputKeys.METHOD, "xml");
t.transform(new DOMSource(doc), new StreamResult(stringOut));
} catch (Exception exe) {
log.error("unknown error occured", exe);
}
return stringOut.toString();
}
public static Node findNode(Node node, String xpathquery) throws Exception {
Iterator it = findNodes(node, xpathquery);
if (it.hasNext()) {
return (Node) it.next();
}
return null;
}
public static Iterator findNodes(Node node, String xpathquery) {
Iterator it = null;
try {
//XPath expression = DocumentHelper.createXPath(xpathquery);
BaseXPath expression = new DOMXPath(xpathquery);
//Node productNode = XPathAPI.selectSingleNode(doc,xPath);
/* THIS IS A DOM3 TEST
XPathEvaluator xpe = (XPathEvaluator) node;
Document response = null;
xpe.evaluate(xpathquery, response, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null);
NodeList nl = response.getChildNodes();
ArrayList al = new ArrayList(nl.getLength());
for(int i=0;i<nl.getLength();i++) {
al.add(nl.item(i));
}
return al.iterator();
*/
it = expression.selectNodes(node).iterator();
} catch (Exception exe) {
}
if (it == null) {
it = new Vector().iterator();
}
return it;
}
public static String getNodeValue(Node node, String path) {
String retString = "";
try {
Node lNode = findNode(node, path);
retString = lNode.getFirstChild().getNodeValue();
} catch (Exception e) {
}
return retString;
}
public static String getNodeValue(Node nde) {
String retString = "";
try {
retString = nde.getFirstChild().getNodeValue();
} catch (Exception e) {
}
return retString;
}
public static Node renameNode(Node nde, String strName) {
if (!strName.equals(nde.getNodeName())) {
Document xdoc = nde.getOwnerDocument();
Element retnode = xdoc.createElement(strName);
NodeList nl = nde.getChildNodes();
for (int i = 0; i < nl.getLength(); i++) {
retnode.appendChild(nl.item(i).cloneNode(true));
}
NamedNodeMap al = nde.getAttributes();
for (int i = 0; i < al.getLength(); i++) {
Attr attr = (Attr) al.item(i);
retnode.setAttribute(attr.getName(), attr.getValue());
}
return retnode;
}
return nde;
}
/**
* Easily creates a new Node, containing Text and returns the new created Node.
* @param doc The Node to append to
* @param elementName The Name of the new Node
* @param elementText The Text to insert into the Node
* @return The created Node
*/
public static Element createTextNode(Node doc, String elementName, String elementText) {
Element elm = doc.getOwnerDocument().createElement(elementName);
Text elmTxt = doc.getOwnerDocument().createTextNode(elementText);
elm.appendChild(elmTxt);
doc.appendChild(elm);
return elm;
}
public static synchronized Document file2Dom(File file) throws Exception {
return docBuilder.parse(file);
}
public static synchronized Document inputstream2Dom(InputStream in) throws Exception {
/*DOMParser parser = new DOMParser();
parser.setFeature("http://xml.org/sax/features/validation", false);
parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
parser.parse(new InputSource(in));
return parser.getDocument();*/
return docBuilder.parse(in); //this will be always validated in xerces. Why?
}
public static synchronized Document inputSource2Dom(InputSource in) throws Exception {
return docBuilder.parse(in);
}
public static synchronized Document string2Dom(String strXML) throws Exception {
InputSource in = new InputSource(new StringReader(strXML));
return docBuilder.parse(in);
/*
InputSource in = new InputSource(new java.io.StringReader(strXML));
DOMParser parser = new DOMParser();
parser.setFeature("http://xml.org/sax/features/validation", false);
parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
parser.parse(in);
return parser.getDocument();
*/
}
public static String node2string(Node nde) {
StringBuffer sb = new StringBuffer();
String attributes = "";
if (nde.hasAttributes()) {
NamedNodeMap attr = nde.getAttributes();
for (int j = 0; j < attr.getLength(); j++) {
attributes += " " + attr.item(j).getNodeName() + "=\"" + getHexEncoded(attr.item(j).getNodeValue())
+ "\"";
}
}
sb.append("<" + nde.getNodeName() + attributes);
if (nde.hasChildNodes()) {
sb.append(">" + nodeList2string(nde.getChildNodes()) + "</" + nde.getNodeName() + ">");
} else {
sb.append("/>");
}
return sb.toString();
}
public static String nodeList2string(NodeList nl) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < nl.getLength(); i++) {
Node nde = nl.item(i);
String attributes = "";
if (nde.getNodeType() == Node.TEXT_NODE) {
sb.append(getHexEncoded(nde.getNodeValue()));
if (nde.hasChildNodes()) {
sb.append(nodeList2string(nde.getChildNodes()));
}
} else if (nde.getNodeType() == Node.CDATA_SECTION_NODE) {
sb.append("<![CDATA[" + nde.getNodeValue() + "]]>");
} else if (nde.getNodeType() == Node.COMMENT_NODE) {
sb.append("<!-- -->");
} else {
if (nde.hasAttributes()) {
NamedNodeMap attr = nde.getAttributes();
for (int j = 0; j < attr.getLength(); j++) {
attributes += " " + attr.item(j).getNodeName() + "=\""
+ getHexEncoded(attr.item(j).getNodeValue()) + "\"";
}
}
sb.append("<" + nde.getNodeName() + attributes);
if (nde.hasChildNodes()) {
sb.append(">" + nodeList2string(nde.getChildNodes()) + "</" + nde.getNodeName() + ">");
} else {
sb.append("/>");
}
}
}
return sb.toString();
}
public static String getHexEncoded(String utf8String) {
if (utf8String != null) {
boolean changed = false;
StringBuffer sbRet = new StringBuffer();
for (int i = 0; i < utf8String.length(); i++) {
int val = (int) utf8String.charAt(i);
if (val > 128) {
changed = true;
sbRet.append("").append(val).append(";");
} else if (val == 38) { // &
sbRet.append("&");
} else if (val == 60) { // <
sbRet.append("<");
} else if (val == 62) { // >
sbRet.append(">");
} else if (val == 34) { // "
sbRet.append(""");
} else {
sbRet.append(utf8String.charAt(i));
}
}
if (log.isDebugEnabled() && changed) {
log.debug("changed string from: " + utf8String + " to " + sbRet.toString());
}
return sbRet.toString();
}
return null;
}
public static String getHexDecoded(String hexString) {
if (hexString != null) {
boolean changed = false;
StringBuffer sbRet = new StringBuffer();
for (int i = 0; i < hexString.length(); i++) {
if (hexString.charAt(i) == '&' && hexString.charAt(i + 1) == '#') {
// we've found one
try {
int endAt = hexString.indexOf((int) ';', i);
int charCode = new Integer(hexString.substring(i + 2, endAt)).intValue();
sbRet.append((char) charCode);
i = endAt;
} catch (Exception exe) {
log.warn("uncorrectly escaped string, returning original value");
sbRet.append(hexString.charAt(i));
}
} else {
sbRet.append(hexString.charAt(i));
}
}
if (log.isDebugEnabled() && changed) {
log.debug("changed string from: " + hexString + " to " + sbRet.toString());
}
return sbRet.toString();
}
return null;
}
public static Node html2node(String html) {
//Import HTML and convert it to XHTML
Tidy myTidy = new Tidy();
myTidy.setQuoteAmpersand(true);
myTidy.setQuoteNbsp(true);
//myTidy.setQuoteMarks(true);
myTidy.setXmlOut(true);
//myTidy.setXmlTags(true);
myTidy.setCharEncoding(Configuration.ISO2022);
myTidy.setShowWarnings(false);
myTidy.setRawOut(false);
myTidy.setQuiet(true);
myTidy.setNumEntities(false);
InputStream in = new ByteArrayInputStream(html.getBytes());
OutputStream outStream = new ByteArrayOutputStream();
myTidy.parseDOM(in, outStream);
String strOut = outStream.toString();
Node nde = null;
try {
Document htmlDoc = XercesHelper.string2Dom(strOut);
nde = XercesHelper.findNode(htmlDoc, "//body");
} catch (Exception exe) {
}
return nde;
}
public static String html2nodeUTF8(String html) throws Exception {
if(log.isDebugEnabled()){
log.debug("html2nodeUTF8 - with param: " + html.substring(0, (html.length()>20)?20:html.length()-1));
log.debug("Thread " + Thread.currentThread().getId() + " \"" + Thread.currentThread().getName() + "\": ");
}
Tidy myTidy = new Tidy();
myTidy.setQuoteNbsp(false);
myTidy.setXmlOut(true);
myTidy.setCharEncoding(Configuration.UTF8);
myTidy.setShowWarnings(false);
myTidy.setRawOut(true);
myTidy.setQuoteAmpersand(false);
myTidy.setQuiet(true);
myTidy.setNumEntities(false);
InputStream in = new ByteArrayInputStream(html.getBytes("UTF-8"));
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
myTidy.parseDOM(in, outStream);
String strOut = outStream.toString("UTF-8");
Node nde = null;
try {
org.w3c.dom.Document htmlDoc = XercesHelper.string2Dom(strOut);
nde = XercesHelper.findNode(htmlDoc, "//body");
} catch (Exception exe) {
}
String text = "";
if (nde != null) {
text = XercesHelper.node2string(nde);
text = text.replaceAll("<body>", "");
text = text.replaceAll("</body>", "");
}
return text;
}
/* public static String html2nodeUTF8(String html) {
//Import HTML and convert it to XHTML
Tidy myTidy = new Tidy();
myTidy.setQuoteAmpersand(false);
myTidy.setQuoteNbsp(true);
//myTidy.setQuoteMarks(true);
myTidy.setXmlOut(true);
//myTidy.setXmlTags(true);
myTidy.setCharEncoding(Configuration.UTF8);
myTidy.setShowWarnings(false);
myTidy.setRawOut(true);
myTidy.setQuiet(true);
myTidy.setNumEntities(false);
InputStream in = new ByteArrayInputStream(html.getBytes());
OutputStream outStream = new ByteArrayOutputStream();
myTidy.parseDOM(in, outStream);
return outStream.toString();
}*/
/**
*
* @param html
* @return
*/
public static String html2utf8string(String html) {
Iterator it = byName.keySet().iterator();
while (it.hasNext()) {
String key = (String) it.next();
html = html.replaceAll("&" + key + ";", (char) ((Integer) byName.get(key)).intValue() + "");
}
return html;
}
}