/*
* This library is part of OpenCms -
* the Open Source Content Management System
*
* Copyright (c) Alkacon Software GmbH (http://www.alkacon.com)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* For further information about Alkacon Software GmbH, please see the
* company website: http://www.alkacon.com
*
* For further information about OpenCms, please see the
* project website: http://www.opencms.org
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.opencms.workplace.tools.database;
import org.opencms.file.CmsPropertyDefinition;
import org.opencms.i18n.CmsEncoder;
import org.opencms.main.CmsLog;
import org.opencms.util.CmsStringUtil;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
/**
* This class implements Html-converting routines based on tidy to modify the
* Html code of the imported Html pages.<p>
*
* @since 6.0.0
*/
public class CmsHtmlImportConverter {
/** Name of the alt attribute. */
private static final String ATTRIB_ALT = "alt";
/** Name of the content attribute. */
private static final String ATTRIB_CONTENT = "content";
/** Name of the href attribute. */
private static final String ATTRIB_HREF = "href";
/** Name of the name attribute. */
private static final String ATTRIB_NAME = "name";
/** Name of the src attribute. */
private static final String ATTRIB_SRC = "src";
/** Node name of the {@code <body>} element. */
private static final String NODE_BODY = "body";
/** Node name of the {@code <head>} element. */
private static final String NODE_HEAD = "head";
/** Node name of the {@code <a>} element. */
private static final String NODE_HREF = "a";
/** Node name of the {@code <html>} element. */
private static final String NODE_HTML = "html";
/** Node name of the {@code <img>} element. */
private static final String NODE_IMG = "img";
/** Node name of the {@code <meta>} element. */
private static final String NODE_META = "meta";
/** Node name of the {@code <title>} element. */
private static final String NODE_TITLE = "title";
/**
 * Set of tag names; after the end tag of one of these tags a "\n" is added to the output.<p>
 */
private HashSet m_enterTags = new HashSet();
/**
 * The absolute path in the real filesystem of the file to convert
 * (stored with forward slashes as path separators).
 */
private String m_filename;
/**
 * Reference to the HtmlImport object, required to access the link translation.
 */
private CmsHtmlImport m_htmlImport;
/**
 * Temporary buffer collecting the generated output during the transformation.
 */
private StringBuffer m_tempString;
/** Instance of JTidy used to parse the input. */
private Tidy m_tidy = new Tidy();
/** Flag indicating if output is currently written; it is switched off while inside the head node. */
private boolean m_write;
/**
 * Creates a new HtmlConverter and configures the internal JTidy instance.<p>
 *
 * @param htmlImport reference to the htmlimport, used later for link translation
 * @param xmlMode switch for setting the import to HTML or XML mode
 */
public CmsHtmlImportConverter(CmsHtmlImport htmlImport, boolean xmlMode) {

    m_htmlImport = htmlImport;
    initialiseTags();

    // general JTidy settings, applied in both modes
    m_tidy.setForceOutput(true);
    m_tidy.setQuiet(true);
    m_tidy.setShowWarnings(false);
    m_tidy.setTidyMark(false);

    // the XML specific settings are only touched in XML mode
    if (xmlMode) {
        m_tidy.setXmlTags(true);
        m_tidy.setXmlSpace(true);
    }
}
/**
 * Extracts a part of a HTML page that is enclosed by two regular expressions.<p>
 *
 * Both expressions are matched case insensitive. The result starts directly after the
 * first match of the start expression and ends directly before the first match of the
 * end expression that is found at or after that position. If the start expression does not
 * match, the result starts at the beginning of the content; if the end expression does
 * not match, the result reaches to the end of the content.<p>
 *
 * @param content the content to extract the part from
 * @param startpoint the regular expression after whose match the extraction starts
 * @param endpoint the regular expression before whose match the extraction ends
 * @return the extracted part of the content
 */
public static String extractHtml(String content, String startpoint, String endpoint) {

    Pattern beginRegex = Pattern.compile(startpoint, Pattern.CASE_INSENSITIVE);
    Pattern finishRegex = Pattern.compile(endpoint, Pattern.CASE_INSENSITIVE);
    Matcher beginMatcher = beginRegex.matcher(content);
    Matcher finishMatcher = finishRegex.matcher(content);

    // without a start match the extraction begins at the very first character
    int from = beginMatcher.find() ? beginMatcher.end() : 0;
    // the end is only searched behind the start position, the fallback is the content end
    int to = finishMatcher.find(from) ? finishMatcher.start() : content.length();

    return content.substring(from, to);
}
/**
 * Transforms HTML code into user defined output.<p>
 *
 * The complete input is read, optionally reduced to the part between the start and the
 * end pattern (only if both patterns are given), parsed with JTidy and transformed by
 * walking the resulting DOM. The transformed code is written to the output, which is
 * closed afterwards.<p>
 *
 * The internal output buffer and the write flag are reset at the beginning of every call,
 * so this method neither fails on an uninitialized buffer nor appends to the result of an
 * earlier conversion. Relative links and image sources are resolved against the file name
 * that was set by {@link #convertHTML(String, String, String, String, Hashtable)}.<p>
 *
 * If reading the input or writing the output fails, a warning is logged and the method
 * returns without throwing an exception.<p>
 *
 * @param input Reader with HTML code
 * @param output Writer with transformed code
 * @param startPattern the start pattern definition for content extracting
 * @param endPattern the end pattern definition for content extracting
 * @param properties the file properties
 */
public void convertHTML(Reader input, Writer output, String startPattern, String endPattern, Hashtable properties) {

    // always start with a clean transformation state
    m_tempString = new StringBuffer();
    m_write = true;

    String outString;
    try {
        outString = readAll(input);
    } catch (IOException e) {
        if (CmsLog.INIT.isWarnEnabled()) {
            CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0, e.getLocalizedMessage()));
        }
        return;
    }

    // extract from html only if both patterns are defined
    if (CmsStringUtil.isNotEmpty(startPattern) && CmsStringUtil.isNotEmpty(endPattern)) {
        String extractMain = extractHtml(outString, startPattern, endPattern);
        // an unchanged length means that nothing was cut away, so the input is used as it is
        if (extractMain.length() != outString.length()) {
            String extractHead = extractHtml(outString, "<html>", CmsStringUtil.BODY_START_REGEX);
            // rebuild a document from the original head part and the extracted main part
            StringBuffer buffer = new StringBuffer(extractHead.length() + extractMain.length() + 255);
            buffer.append("<html>");
            buffer.append(extractHead);
            buffer.append("<body>");
            buffer.append(extractMain);
            buffer.append("</body></html>");
            outString = buffer.toString();
        }
    }

    // provide the HTML as InputStream for parseDOM
    InputStream in;
    try {
        in = new ByteArrayInputStream(outString.getBytes(CmsEncoder.ENCODING_UTF_8));
    } catch (UnsupportedEncodingException e) {
        // this should never happen since UTF-8 is always supported
        in = new ByteArrayInputStream(outString.getBytes());
    }
    m_tidy.setInputEncoding(CmsEncoder.ENCODING_UTF_8);
    m_tidy.setOutputEncoding(CmsEncoder.ENCODING_UTF_8);

    // redirect the tidy error information into a PrintWriter whose content is discarded
    PrintWriter errorLog = new PrintWriter(new ByteArrayOutputStream(), true);
    m_tidy.setErrout(errorLog);

    Node node = m_tidy.parseDOM(in, null);
    // parse errors are only logged, the transformation is done anyway
    if (m_tidy.getParseErrors() != 0) {
        if (CmsLog.INIT.isWarnEnabled()) {
            CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0));
        }
    }

    // second step: create the transformed output from the DOM
    printDocument(node, properties);

    try {
        String content = m_tempString.toString();
        // the generic end tag handling creates "<br></br>", reduce this to a simple "<br>"
        content = CmsStringUtil.substitute(content, "<br></br>", "<br>");
        // separate a link from a word that follows directly after its end tag
        content = CmsStringUtil.substitutePerl(content, "</a>(\\w+)", "</a> $1", "g");
        output.write(content);
        output.close();
    } catch (IOException e) {
        if (CmsLog.INIT.isWarnEnabled()) {
            CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_1, e.getLocalizedMessage()));
        }
        return;
    }
}

/**
 * Reads all remaining characters from the given reader using a chunk buffer.<p>
 *
 * @param input the reader to read from
 * @return the characters read from the reader
 * @throws IOException if reading from the reader fails
 */
private static String readAll(Reader input) throws IOException {

    StringBuffer result = new StringBuffer();
    char[] chunk = new char[4096];
    int count;
    while ((count = input.read(chunk)) != -1) {
        result.append(chunk, 0, count);
    }
    return result.toString();
}
/**
 * Transforms the HTML code given as String and returns the transformed code.<p>
 *
 * The transformation state (output buffer, write flag and file name) is initialized
 * here, the conversion itself is delegated to
 * {@link #convertHTML(Reader, Writer, String, String, Hashtable)}.<p>
 *
 * @param filename the absolute path in the real filesystem of the file to convert
 * @param inString String with HTML code
 * @param startPattern the start pattern definition for content extracting
 * @param endPattern the end pattern definition for content extracting
 * @param properties the file properties
 * @return String with transformed code
 */
public String convertHTML(
    String filename,
    String inString,
    String startPattern,
    String endPattern,
    Hashtable properties) {

    m_tempString = new StringBuffer();
    m_write = true;
    // the file name is kept with forward slashes only
    m_filename = filename.replace('\\', '/');

    StringReader source = new StringReader(inString);
    StringWriter target = new StringWriter();
    convertHTML(source, target, startPattern, endPattern, properties);
    return target.toString();
}
/**
 * Fills the set m_enterTags with the names of all tags that get a line break
 * appended after their end tag.<p>
 */
private void initialiseTags() {

    String[] tagNames = "p,table,tr,td,body,head,script,pre,title,style,h1,h2,h3,h4,h5,h6,ul,ol,li".split(",");
    for (int i = 0; i < tagNames.length; i++) {
        m_enterTags.add(tagNames[i]);
    }
}
/**
 * Walks recursively through the DOM and creates the user defined output.<p>
 *
 * Document nodes are resolved to their document element, element nodes are written with
 * start tag, children and end tag, text nodes are written as text. All other node types
 * are ignored.<p>
 *
 * @param node Node of DOM from HTML code, may be <code>null</code>
 * @param properties the file properties
 */
private void printDocument(Node node, Hashtable properties) {

    // nothing to do for a missing node, this ends the recursion
    if (node == null) {
        return;
    }

    int nodeType = node.getNodeType();
    String nodeName = node.getNodeName();

    if (nodeType == Node.DOCUMENT_NODE) {
        printDocument(((Document)node).getDocumentElement(), properties);
    } else if (nodeType == Node.TEXT_NODE) {
        transformTextNode(node);
    } else if (nodeType == Node.ELEMENT_NODE) {
        // nothing inside the <head> node must be part of the output, but its content
        // still has to be scanned to get the <title> and all <meta> tags
        if (nodeName.equals(NODE_HEAD)) {
            m_write = false;
        }
        transformStartElement(node, properties);

        // handle all child nodes recursively
        NodeList childNodes = node.getChildNodes();
        if (childNodes != null) {
            int count = childNodes.getLength();
            int index = 0;
            while (index < count) {
                printDocument(childNodes.item(index), properties);
                index++;
            }
        }

        // all children are done, add the end tag and leave the <head> mode again
        transformEndElement(node);
        if (node.getNodeName().equals(NODE_HEAD)) {
            m_write = true;
        }
    }
}
/**
 * Creates the end tag for an element node in the output.<p>
 *
 * No end tag is written for the html and the body node, and nothing is written at all
 * while the writing mode is switched off.<p>
 *
 * @param node actual element node
 */
private void transformEndElement(Node node) {

    String tag = node.getNodeName();

    // the <HTML> and <BODY> node must be skipped
    if (tag.equals(NODE_HTML) || tag.equals(NODE_BODY)) {
        return;
    }
    // only do some output if we are in writing mode
    if (!m_write) {
        return;
    }

    m_tempString.append("</").append(tag).append(">");
    // some tags are followed by a line break
    if (m_enterTags.contains(node.getNodeName())) {
        m_tempString.append("\n");
    }
}
/**
 * Creates the start tag for an element node in the output.<p>
 *
 * The html and body nodes are skipped. The title and meta nodes are not written but
 * evaluated to fill the properties. For all other nodes the start tag with all attributes
 * is written if the writing mode is switched on; the href attribute of links and the src
 * attribute of images are translated before.<p>
 *
 * @param node actual element node
 * @param properties the file properties
 */
private void transformStartElement(Node node, Hashtable properties) {

    String tag = node.getNodeName();

    // the <HTML> and <BODY> node must be skipped
    if (tag.equals(NODE_HTML) || tag.equals(NODE_BODY)) {
        return;
    }
    // the <TITLE> node must be read and its value set as property of the imported file
    if (tag.equals(NODE_TITLE)) {
        writeTitleProperty(node, properties);
        return;
    }
    // the same is done for the <META> nodes
    if (tag.equals(NODE_META)) {
        writeMetaTagProperty(node, properties);
        return;
    }
    // everything else only creates output, so there is nothing to do outside writing mode
    if (!m_write) {
        return;
    }

    boolean isLink = tag.equals(NODE_HREF);
    boolean isImage = !isLink && tag.equals(NODE_IMG);

    m_tempString.append("<").append(tag);

    String imageName = "";
    String altText = "";
    NamedNodeMap attributes = node.getAttributes();
    for (int i = attributes.getLength() - 1; i >= 0; i--) {
        Node attribute = attributes.item(i);
        String attrName = attribute.getNodeName();
        String attrValue = attribute.getNodeValue();

        if (isLink) {
            // the reference of a link must be converted
            if (attrName.equals(ATTRIB_HREF)) {
                attrValue = translateHref(attrValue);
            }
        } else if (isImage) {
            if (attrName.equals(ATTRIB_SRC)) {
                // an image which is not external must get its correct location in the VFS
                if (attrValue.indexOf("://") <= 0) {
                    imageName = m_htmlImport.getAbsoluteUri(attrValue, currentFolder());
                    attrValue = m_htmlImport.translateLink(imageName);
                }
            } else if (attrName.equals(ATTRIB_ALT)) {
                altText = attrValue;
            }
        }

        m_tempString.append(" ").append(attrName).append("=\"").append(attrValue).append("\"");
    }

    if (isImage) {
        // store the alt text of this image for later use
        m_htmlImport.storeImageInfo(imageName, altText);
    }
    m_tempString.append(">");
}

/**
 * Translates the value of a href attribute.<p>
 *
 * External links are stored in the HtmlImport object and replaced by the path of the
 * stored link file if there is one. Mailto and javascript links are not changed.
 * All other links are translated into the VFS.<p>
 *
 * @param href the original value of the href attribute
 * @return the value to write to the output
 */
private String translateHref(String href) {

    if (href.indexOf("://") > 0) {
        // this is an external link, store it for later creation of an entry in the link gallery
        String externalLinkFile = m_htmlImport.storeExternalLink(href);
        if (externalLinkFile == null) {
            return href;
        }
        return m_htmlImport.getLinkGallery() + externalLinkFile;
    }
    if (href.startsWith("mailto:") || href.startsWith("javascript:")) {
        return href;
    }
    // get the new link into the VFS
    String internalUri = m_htmlImport.getAbsoluteUri(href, currentFolder());
    return m_htmlImport.translateLink(internalUri);
}

/**
 * Returns the part of the name of the converted file up to and including the last "/".<p>
 *
 * @return the folder part of the file name
 */
private String currentFolder() {

    return m_filename.substring(0, m_filename.lastIndexOf("/") + 1);
}
/**
 * Appends the value of a text node to the output.<p>
 *
 * @param node actual text node
 */
private void transformTextNode(Node node) {

    // no output is done outside of the writing mode
    if (!m_write) {
        return;
    }
    m_tempString.append(node.getNodeValue());
}
/**
 * Stores a meta tag as property by reading its name and content attributes.<p>
 *
 * The property is only stored if both attributes have a non empty value. The text
 * "{subst}" is removed from the content before it is stored.<p>
 *
 * @param node the meta tag node in html document
 * @param properties the properties hashtable
 */
private void writeMetaTagProperty(Node node, Hashtable properties) {

    String propertyName = "";
    String propertyValue = "";

    // scan the attributes from the last to the first for name and content
    NamedNodeMap attributes = node.getAttributes();
    int index = attributes.getLength();
    while (--index >= 0) {
        Node attribute = attributes.item(index);
        String attrName = attribute.getNodeName();
        String attrValue = attribute.getNodeValue();
        if (attrName.equals(ATTRIB_NAME)) {
            propertyName = attrValue;
        } else if (attrName.equals(ATTRIB_CONTENT)) {
            propertyValue = attrValue;
        }
    }

    // an incomplete <META> node is ignored
    if ((propertyName.length() == 0) || (propertyValue.length() == 0)) {
        return;
    }
    properties.put(propertyName, CmsStringUtil.substitute(propertyValue, "{subst}", ""));
}
/**
 * Sets the title property by reading the first child of the title node.<p>
 *
 * The text "{subst}" is removed from the title. The title is also used as navigation
 * text if the properties do not contain a navigation text yet.<p>
 *
 * @param node the title node in html document
 * @param properties the properties hashtable
 */
private void writeTitleProperty(Node node, Hashtable properties) {

    // the title string is stored in the first child node
    NodeList childNodes = node.getChildNodes();
    Node firstChild = (childNodes == null) ? null : childNodes.item(0);
    String titleText = (firstChild == null) ? "" : firstChild.getNodeValue();

    // without a title there is nothing to store
    if ((titleText == null) || (titleText.length() == 0)) {
        return;
    }

    String cleanTitle = CmsStringUtil.substitute(titleText, "{subst}", "");
    properties.put(CmsPropertyDefinition.PROPERTY_TITLE, cleanTitle);
    // the title will be used as navtext if no other navtext is given
    if (properties.get(CmsPropertyDefinition.PROPERTY_NAVTEXT) == null) {
        properties.put(CmsPropertyDefinition.PROPERTY_NAVTEXT, cleanTitle);
    }
}
}