/*
* Copyright (c) 2008 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.batchresearch;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import nu.validator.htmlparser.common.DoctypeExpectation;
import nu.validator.htmlparser.common.DocumentMode;
import nu.validator.htmlparser.common.DocumentModeHandler;
import nu.validator.htmlparser.common.Heuristics;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.sax.HtmlParser;
import nu.validator.xml.AttributesPermutingXMLReaderWrapper;
import nu.validator.xml.dataattributes.DataAttributeDroppingSchemaWrapper;
import nu.validator.xml.langattributes.XmlLangAttributeDroppingSchemaWrapper;
import org.whattf.checker.jing.CheckerSchema;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import com.thaiopensource.relaxng.impl.CombineValidator;
import com.thaiopensource.util.PropertyMap;
import com.thaiopensource.util.PropertyMapBuilder;
import com.thaiopensource.validate.Schema;
import com.thaiopensource.validate.SchemaReader;
import com.thaiopensource.validate.ValidateProperty;
import com.thaiopensource.validate.Validator;
import com.thaiopensource.validate.rng.CompactSchemaReader;
public class ValidationWorker implements Runnable, ErrorHandler,
DocumentModeHandler {
private static final Pattern[] PATTERNS = {
Pattern.compile("Duplicate ID \u201C[^\u201D]*\u201D."),
Pattern.compile("Bad value \u201C[^\u201D]*\u201D for attribute "),
Pattern.compile("declares a duplicate ID value \u201C[^\u201D]*\u201D"),
Pattern.compile("The hashed ID reference in attribute \u201Cusemap\u201D referred to \u201C[^\u201D]*\u201D,"),
};
private static final String[] REPLACEMENTS = {
"Duplicate ID (consolidated).",
"Bad value (consolidated) for attribute ",
"declares a duplicate ID value (consolidated)",
"The hashed ID reference in attribute \u201Cusemap\u201D referred to (consolidated),"
};
private static String replaceSpecificValues(String str) {
if (str.startsWith("Malformed byte sequence: ")) {
return "Malformed byte sequence.";
} else if (str.startsWith("Unmappable byte sequence: ")) {
return "Unmappable byte sequence.";
}
for (int i = 0; i < PATTERNS.length; i++) {
Pattern p = PATTERNS[i];
Matcher m = p.matcher(str);
if (m.find()) {
return m.replaceFirst(REPLACEMENTS[i]);
}
}
return str;
}
private final CountingReadLine in;
private final PrintWriter out;
private final File rootDir;
private final XMLReader parser;
private final HashSet<String> parseErrors = new HashSet<String>();
private final HashSet<String> validationErrors = new HashSet<String>();
private String documentMode = null;
private Set<Schema> schemas;
/**
* @param in
* @param out
* @param rootDir
* @param resolver
*/
public ValidationWorker(CountingReadLine in, PrintWriter out, File rootDir,
Set<Schema> schemas) {
this.in = in;
this.out = out;
this.rootDir = rootDir;
this.schemas = schemas;
this.parser = setupParser();
}
private Validator setupValidator(Set<Schema> schemas) {
PropertyMapBuilder builder = new PropertyMapBuilder();
builder.put(ValidateProperty.ERROR_HANDLER, new ErrorHandler() {
public void error(SAXParseException exception) throws SAXException {
validationErrors.add(replaceSpecificValues(exception.getMessage()));
}
public void fatalError(SAXParseException exception)
throws SAXException {
// should not happen
validationErrors.add(replaceSpecificValues(exception.getMessage()));
}
public void warning(SAXParseException exception)
throws SAXException {
}});
PropertyMap map = builder.toPropertyMap();
Validator rv = null;
for (Schema schema : schemas) {
Validator v = schema.createValidator(map);
if (rv == null) {
rv = v;
} else {
rv = new CombineValidator(rv, v);
}
}
return rv;
}
private XMLReader setupParser() {
HtmlParser htmlParser = new HtmlParser();
htmlParser.setCommentPolicy(XmlViolationPolicy.ALLOW);
htmlParser.setContentNonXmlCharPolicy(XmlViolationPolicy.ALLOW);
htmlParser.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
htmlParser.setNamePolicy(XmlViolationPolicy.ALLOW);
htmlParser.setStreamabilityViolationPolicy(XmlViolationPolicy.ALLOW);
htmlParser.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
htmlParser.setMappingLangToXmlLang(true);
htmlParser.setHeuristics(Heuristics.ALL);
htmlParser.setDoctypeExpectation(DoctypeExpectation.NO_DOCTYPE_ERRORS);
htmlParser.setCheckingNormalization(true);
htmlParser.setDocumentModeHandler(this);
XMLReader rv = new AttributesPermutingXMLReaderWrapper(htmlParser);
rv.setErrorHandler(this);
return rv;
}
public void run() {
String inLine = null;
for (;;) {
String url = null;
try {
while ((inLine = in.readLine()) != null) {
parseErrors.clear();
validationErrors.clear();
documentMode = null;
Validator validator = setupValidator(schemas);
parser.setContentHandler(validator.getContentHandler());
String md5;
String charset;
int firstTab = inLine.indexOf('\t');
int secondTab = inLine.indexOf('\t', firstTab + 1);
md5 = inLine.substring(0, firstTab);
url = inLine.substring(firstTab + 1, secondTab);
charset = inLine.substring(secondTab + 1, inLine.length());
InputSource is = new InputSource();
File top = new File(rootDir, md5.substring(0, 2));
File second = new File(top, md5.substring(2, 4));
File inFile = new File(second, md5 + ".gz");
is.setByteStream(new GZIPInputStream(new FileInputStream(
inFile)));
is.setSystemId(url);
if (!"null".equals(charset)) {
is.setEncoding(charset);
}
parser.parse(is);
validator = null;
StringBuilder sb = new StringBuilder();
boolean first = true;
if (parseErrors.isEmpty() && validationErrors.isEmpty()) {
if (!first) {
sb.append('\n');
} else {
first = false;
}
sb.append(url + '\t' + documentMode + "\tP\t"
+ "NEITHER ERRORS");
}
if (parseErrors.isEmpty()) {
if (!first) {
sb.append('\n');
} else {
first = false;
}
sb.append(url + '\t' + documentMode + "\tP\t"
+ "NO PARSE ERRORS");
} else {
for (String error : parseErrors) {
if (!first) {
sb.append('\n');
} else {
first = false;
}
sb.append(url + '\t' + documentMode + "\tP\t"
+ sanitize(error));
}
}
if (validationErrors.isEmpty()) {
if (!first) {
sb.append('\n');
} else {
first = false;
}
sb.append(url + '\t' + documentMode + "\tV\t"
+ "NO VALIDATION ERRORS");
} else {
for (String error : validationErrors) {
if (!first) {
sb.append('\n');
} else {
first = false;
}
sb.append(url + '\t' + documentMode + "\tV\t"
+ sanitize(error));
}
}
out.println(sb.toString());
}
return;
} catch (Throwable t) {
System.err.println(url);
t.printStackTrace();
}
}
}
private String sanitize(String error) {
return error.replaceAll("[\t\r\n]", " ");
}
public static void main(String[] args) throws Exception {
BufferedReader in = new BufferedReader(new InputStreamReader(
new FileInputStream(args[0]), "utf-8"));
PrintWriter out = new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(
new FileOutputStream(args[1])), "utf-8"), true);
File rootDir = new File(args[2]);
Set<Schema> schemas = new HashSet<Schema>();
schemas.add(CheckerSchema.ASSERTION_SCH);
schemas.add(CheckerSchema.NORMALIZATION_CHECKER);
schemas.add(CheckerSchema.TABLE_CHECKER);
schemas.add(CheckerSchema.TEXT_CONTENT_CHECKER);
schemas.add(CheckerSchema.USEMAP_CHECKER);
InputSource is = new InputSource((new File(args[3])).toURL().toExternalForm());
SchemaReader sr = CompactSchemaReader.getInstance();
schemas.add(new XmlLangAttributeDroppingSchemaWrapper(new DataAttributeDroppingSchemaWrapper(sr.createSchema(is, PropertyMap.EMPTY))));
CountingReadLine countingReadLine = new CountingReadLine(in);
for (int i = 0; i < 4; i++) {
(new Thread(new ValidationWorker(countingReadLine, out, rootDir, schemas))).start();
}
}
public void error(SAXParseException exception) throws SAXException {
this.parseErrors.add(replaceSpecificValues(exception.getMessage()));
}
public void fatalError(SAXParseException exception) throws SAXException {
// This should never happen
this.parseErrors.add(replaceSpecificValues(exception.getMessage()));
}
public void warning(SAXParseException exception) throws SAXException {
}
public void documentMode(DocumentMode mode, String publicIdentifier,
String systemIdentifier, boolean html4SpecificAdditionalErrorChecks)
throws SAXException {
switch (mode) {
case ALMOST_STANDARDS_MODE:
documentMode = "A";
break;
case QUIRKS_MODE:
documentMode = "Q";
break;
case STANDARDS_MODE:
if (publicIdentifier == null && systemIdentifier == null) {
documentMode = "H";
} else {
documentMode = "S";
}
break;
}
}
}