/*
* Copyright 2007-2008 Amazon Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://aws.amazon.com/apache2.0
*
* This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
* OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and
* limitations under the License.
*/
package com.amazonaws.mturk.addon;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.xerces.parsers.SAXParser;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.log4j.Logger;
import com.amazonaws.mturk.service.exception.ValidationException;
/**
 * Validates the XHTML carried inside <code>&lt;FormattedContent&gt;</code> elements against
 * the FormattedContentXHTMLSubset schema and strips comments from those elements.
 *
 * Each <code>&lt;FormattedContent&gt;</code> element is expected to wrap its markup in a single
 * CDATA section that starts immediately after the opening tag and ends immediately
 * before the closing tag.
 */
public class XhtmlValidator {

  // -------------------------------------------------------------
  // Constants - Private
  // -------------------------------------------------------------

  private static final String NS = "http://www.w3.org/1999/xhtml";
  private static final String CDATA_HEADER = "<![CDATA[";
  private static final String CDATA_FOOTER = "]]>";
  private static final String FORMATTED_CONTENT_HEADER = "<FormattedContent>";
  private static final String FORMATTED_CONTENT_FOOTER = "</FormattedContent>";

  /** Location of the schema the formatted content is validated against. */
  private static final String XSD =
      "http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/FormattedContentXHTMLSubset.xsd";

  /** Matches one complete FormattedContent element (non-greedy, may span lines). */
  private static final Pattern FORMATTED_CONTENT_PATTERN =
      Pattern.compile("<FormattedContent>.*?</FormattedContent>", Pattern.DOTALL);

  /** Matches one comment (non-greedy, may span lines). */
  private static final Pattern COMMENT_PATTERN =
      Pattern.compile("<!--.*?-->", Pattern.DOTALL);

  protected static Logger log = Logger.getLogger(XhtmlValidator.class);

  /**
   * Validates every <code>&lt;FormattedContent&gt;</code> element found in <code>content</code>
   * against FormattedContentXHTMLSubset.xsd and returns <code>content</code> with the comments
   * inside those elements removed. Text outside the elements is returned unchanged.
   *
   * Validation is performed on each element as it appears in the input, i.e. before its
   * comments are removed. Parser warnings are ignored; errors and fatal errors fail the
   * validation.
   *
   * @param content the text containing zero or more FormattedContent elements
   * @return <code>content</code> with comments stripped from each FormattedContent element
   * @throws ValidationException if the CDATA wrapper of an element is missing or misplaced,
   *         if the parser cannot be configured or fails, or if an element does not validate
   */
  public static String validateAndClean(String content)
      throws ValidationException {
    SAXParser parser = new SAXParser();
    Validator handler = new Validator();
    parser.setErrorHandler(handler);

    // Only ever holds the namespaced form handed to the parser; used in error messages.
    String htmlContent = null;
    try {
      initializeParser(parser);

      Matcher matcher = FORMATTED_CONTENT_PATTERN.matcher(content);
      StringBuilder cleaned = new StringBuilder(content.length());
      int copiedUpTo = 0;

      while (matcher.find()) {
        String block = matcher.group();

        // Splice by the matcher's own offsets instead of searching for the block text
        // again: the block is replaced exactly where it was found, even when another
        // element has identical text.
        cleaned.append(content, copiedUpTo, matcher.start());
        cleaned.append(removeComments(block));
        copiedUpTo = matcher.end();

        htmlContent = insertNamespaceAndRemoveCDataTags(block);
        try {
          parser.parse(new InputSource(new StringReader(htmlContent)));
          parser.reset(); //reset parser so we can use it for the next htmlContent
          initializeParser(parser); //resetting sets parser back to factory settings, so we need to initialize it again
        } catch (java.io.IOException e) {
          String msg = "SAXParser exception: " + e + "\nhtmlContent: " + htmlContent;
          log.error(msg);
          throw new ValidationException(msg, e);
        }

        if (handler.validationError) {
          String msg = "Validator error: " + handler.saxParseException.getMessage()
              + "\nhtmlContent: " + htmlContent;
          log.error(msg);
          // Keep the parser's exception as the cause so callers can inspect it.
          throw new ValidationException(msg, handler.saxParseException);
        }
      }

      // Copy whatever follows the last FormattedContent element (or everything, if none matched).
      cleaned.append(content, copiedUpTo, content.length());
      return cleaned.toString();
    } catch (SAXException e) {
      String msg = "SAXParser exception: " + e + "\nhtmlContent: " + htmlContent;
      log.error(msg);
      throw new ValidationException(msg, e);
    }
  }

  //-------------------------------------------------------------
  // Methods - Private
  //-------------------------------------------------------------

  /**
   * Turns on schema validation for the parser and points the XHTML namespace at the
   * FormattedContentXHTMLSubset schema.
   *
   * @param parser the parser to configure
   * @throws SAXException if the parser rejects one of the features or the property
   */
  private static void initializeParser(SAXParser parser)
      throws SAXException {
    parser.setFeature("http://xml.org/sax/features/validation", true);
    parser.setFeature("http://apache.org/xml/features/validation/schema", true);
    parser.setFeature("http://apache.org/xml/features/validation/schema-full-checking", true);
    parser.setProperty("http://apache.org/xml/properties/schema/external-schemaLocation", NS + " " + XSD);
  }

  /**
   * Unwraps the CDATA section of a FormattedContent element and re-wraps its body in a
   * FormattedContent element that declares the XHTML namespace.
   *
   * @param text one complete FormattedContent element
   * @return the element body wrapped in a namespaced FormattedContent element
   * @throws ValidationException if the first CDATA header does not directly follow the
   *         opening tag, or the first CDATA footer does not directly precede the closing tag
   */
  private static String insertNamespaceAndRemoveCDataTags(String text) throws ValidationException {
    int cdataStart = text.indexOf(CDATA_HEADER);
    int expectedCdataStart = text.indexOf(FORMATTED_CONTENT_HEADER) + FORMATTED_CONTENT_HEADER.length();
    if (cdataStart != expectedCdataStart) {
      String msg = "Missing or misplaced CDATA header: " + CDATA_HEADER + " in content :" + text;
      log.error(msg);
      throw new ValidationException(msg);
    }

    int cdataEnd = text.indexOf(CDATA_FOOTER);
    if (cdataEnd + CDATA_FOOTER.length() != text.indexOf(FORMATTED_CONTENT_FOOTER)) {
      String msg = "Missing or misplaced CDATA footer: " + CDATA_FOOTER + " in content :" + text;
      log.error(msg);
      throw new ValidationException(msg);
    }

    return "<FormattedContent xmlns=\"" + NS + "\">"
        + text.substring(cdataStart + CDATA_HEADER.length(), cdataEnd)
        + "</FormattedContent>";
  }

  /**
   * Removes every comment from the given FormattedContent element.
   *
   * @param htmlContent one complete FormattedContent element
   * @return the element text with all comments deleted
   */
  private static String removeComments(String htmlContent) {
    return COMMENT_PATTERN.matcher(htmlContent).replaceAll("");
  }

  //-------------------------------------------------------------
  // Inner Classes
  //-------------------------------------------------------------

  /**
   * Error handler for the SAX parser. Records that an error or fatal error was reported,
   * together with the most recent exception, instead of throwing; warnings are ignored.
   */
  private static class Validator extends DefaultHandler {
    public boolean validationError = false;
    public SAXParseException saxParseException = null;

    @Override
    public void error(SAXParseException exception)
        throws SAXException {
      validationError = true;
      saxParseException = exception;
    }

    @Override
    public void fatalError(SAXParseException exception)
        throws SAXException {
      validationError = true;
      saxParseException = exception;
    }

    @Override
    public void warning(SAXParseException exception)
        throws SAXException { }
  }
}