/*
* $Id$
*
* Copyright 2006, The jCoderZ.org Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the jCoderZ.org Project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.jcoderz.commons.doclet;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;
/**
* This class provides an easy interface to jTidy to clean up
* html fragments as used within javadoc.
*
* @author Andreas Mandel
*/
public class HtmlCleaner
{
    /** The full qualified name of this class. */
    private static final String CLASSNAME = HtmlCleaner.class.getName();
    /** The logger to use. */
    private static final Logger logger = Logger.getLogger(CLASSNAME);
    /** Markup put in front of the fragment to form a complete document. */
    private static final String FIX_HEADER
        = "<html><head><title>clean</title></head><body>";
    /** Markup appended after the fragment to close the document. */
    private static final String FIX_FOOTER
        = "</body></html>";
    /** Opening body tag searched for in the tidy output. */
    private static final String BODY_OPEN = "<body>";
    /** Closing body tag searched for in the tidy output. */
    private static final String BODY_CLOSE = "</body>";
    /** Name of the charset used to pass data to and from tidy. */
    private static final String ENCODING = "utf-8";
    /** Result returned if no cleaned up body could be produced. */
    private static final String INVALID_HTML
        = "Invalid HTML could not be parsed.";

    // Outcome of the most recent clean() call. These fields are
    // overwritten by every call without any synchronization, so an
    // instance must not be shared between threads without external
    // locking.
    /** The messages collected during the last clean. */
    private String mWarnings = "";
    /** Whether the last clean encountered errors. */
    private boolean mHasErrors = false;

    /**
     * Converts the given HTML fragment string into wellformed xhtml.
     *
     * <p>The fragment is wrapped into a minimal html document, passed
     * through jTidy and the content of the resulting body element is
     * returned. The outcome of the run can be queried afterwards via
     * {@link #getWarnings()} and {@link #hasErrors()}.</p>
     *
     * @param in the html fragment to be cleaned up.
     * @return a cleaned up wellformed xhtml version of the in string,
     *      or the text "Invalid HTML could not be parsed." if no
     *      cleaned up body could be obtained.
     */
    public String clean (CharSequence in)
    {
        if (logger.isLoggable(Level.FINER))
        {
            logger.entering(CLASSNAME, "clean(CharSequence)", in);
        }
        mHasErrors = false;
        final Tidy tidy = new Tidy();
        final String inData = FIX_HEADER + in + FIX_FOOTER;
        final StringWriter err = new StringWriter();
        final PrintWriter errOut = new PrintWriter(err);
        String result;
        try
        {
            tidy.setCharEncoding(Configuration.UTF8);
            tidy.setMakeClean(true);
            tidy.setXmlOut(true);
            tidy.setRawOut(true);
            tidy.setNumEntities(true);
            tidy.setWraplen(0); // do not care about line length
            tidy.setErrout(errOut);
            final InputStream inStream = new ByteArrayInputStream(
                inData.getBytes(ENCODING));
            final ByteArrayOutputStream out = new ByteArrayOutputStream();
            tidy.parse(inStream, out);
            // Make sure all diagnostics reached the StringWriter
            // before it is read below.
            errOut.flush();

            final String body
                = extractBody(new String(out.toByteArray(), ENCODING));
            result = (body != null) ? body : INVALID_HTML;

            final boolean parseErrors = tidy.getParseErrors() != 0;
            if (tidy.getParseWarnings() == 0 && !parseErrors)
            {
                mWarnings = "";
            }
            else
            {
                mWarnings = err.toString();
            }
            // Errors are present if tidy reported at least one, or if
            // no body could be extracted and the placeholder text is
            // returned instead of cleaned up markup.
            mHasErrors = parseErrors || body == null;
        }
        catch (Exception ex)
        {
            errOut.flush();
            result = INVALID_HTML;
            err.write(result);
            err.write("Got exception:");
            err.write(ex.toString());
            ex.printStackTrace(errOut);
            errOut.flush();
            mWarnings = err.toString();
            logger.log(Level.FINER,
                "Could not handle html fragment. '" + in + "'." , ex);
            mHasErrors = true;
        }
        if (logger.isLoggable(Level.FINER))
        {
            logger.exiting(CLASSNAME, "clean(CharSequence)", result);
        }
        return result;
    }

    /**
     * Returns the warnings encountered during last clean.
     * @return the warnings encountered during last clean.
     */
    public String getWarnings ()
    {
        return mWarnings;
    }

    /**
     * Returns whether the last clean encountered errors.
     * @return true if jTidy reported parse errors, no cleaned up
     *      body could be obtained or an exception occurred during
     *      the last clean; false otherwise.
     */
    public boolean hasErrors ()
    {
        return mHasErrors;
    }

    /**
     * Cuts the trimmed content between the first opening and the last
     * closing body tag out of the given document.
     * @param document the complete document as written by tidy.
     * @return the trimmed body content, or null if the document does
     *      not contain an opening body tag followed by a closing one.
     */
    private static String extractBody (String document)
    {
        final int open = document.indexOf(BODY_OPEN);
        final int close = document.lastIndexOf(BODY_CLOSE);
        if (open == -1 || close == -1)
        {
            return null;
        }
        // Skip only the tag itself; a line break that follows it is
        // removed by trim(), so no assumption about it is needed.
        final int contentStart = open + BODY_OPEN.length();
        if (close < contentStart)
        {
            return null;
        }
        return document.substring(contentStart, close).trim();
    }
}