ValidationWorker.java example

Explorer
validator-master
- resources
  - examples
    - research-src
      - nu
        validator
        batchresearch
        Analyze.java
        CountingReadLine.java
        IntWrap.java
        ValidationWorker.java
        dmozdl
        DmozDriver.java
        DmozHandler.java
        Downloader.java
        perftest
        DriverWrapper.java
        NullWriter.java
        ParserPerfHarness.java
        ParserPerfHarnessNew.java
        TokensToSax.java
        svgresearch
        NameTriple.java
        ScoreBoard.java
        SortStruct.java
        SvgAnalysisHandler.java
        SvgAnalyzer.java
        SvgDownloader.java
- src
  - nu
    - validator
/*
 * Copyright (c) 2008 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a 
 * copy of this software and associated documentation files (the "Software"), 
 * to deal in the Software without restriction, including without limitation 
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 * and/or sell copies of the Software, and to permit persons to whom the 
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in 
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.batchresearch;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import nu.validator.htmlparser.common.DoctypeExpectation;
import nu.validator.htmlparser.common.DocumentMode;
import nu.validator.htmlparser.common.DocumentModeHandler;
import nu.validator.htmlparser.common.Heuristics;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.sax.HtmlParser;
import nu.validator.xml.AttributesPermutingXMLReaderWrapper;
import nu.validator.xml.dataattributes.DataAttributeDroppingSchemaWrapper;
import nu.validator.xml.langattributes.XmlLangAttributeDroppingSchemaWrapper;

import org.whattf.checker.jing.CheckerSchema;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;

import com.thaiopensource.relaxng.impl.CombineValidator;
import com.thaiopensource.util.PropertyMap;
import com.thaiopensource.util.PropertyMapBuilder;
import com.thaiopensource.validate.Schema;
import com.thaiopensource.validate.SchemaReader;
import com.thaiopensource.validate.ValidateProperty;
import com.thaiopensource.validate.Validator;
import com.thaiopensource.validate.rng.CompactSchemaReader;

public class ValidationWorker implements Runnable, ErrorHandler,
        DocumentModeHandler {

    /**
     * Regular expressions that locate the document-specific part of certain
     * error messages: a value quoted between U+201C and U+201D together with
     * the fixed text around it. Entry {@code i} is paired with
     * {@code REPLACEMENTS[i]}, so the two arrays must be kept the same length
     * and in the same order.
     */
    private static final Pattern[] PATTERNS = {
      // NOTE(review): the trailing '.' is an unescaped regex dot, so it matches
      // any character, not only a literal full stop.
      Pattern.compile("Duplicate ID \u201C[^\u201D]*\u201D."),
      Pattern.compile("Bad value \u201C[^\u201D]*\u201D for attribute "),
      Pattern.compile("declares a duplicate ID value \u201C[^\u201D]*\u201D"),
      Pattern.compile("The hashed ID reference in attribute \u201Cusemap\u201D referred to \u201C[^\u201D]*\u201D,"),
    };
    
    /**
     * Replacement text for the first match of the pattern at the same index
     * in {@code PATTERNS}; the quoted value is replaced by the fixed marker
     * "(consolidated)" so that messages differing only in that value become
     * identical strings.
     */
    private static final String[] REPLACEMENTS = {
        "Duplicate ID (consolidated).",
        "Bad value (consolidated) for attribute ",
        "declares a duplicate ID value (consolidated)",
        "The hashed ID reference in attribute \u201Cusemap\u201D referred to (consolidated),"
    };
    
    /**
     * Normalizes an error message so that messages which differ only in a
     * document-specific value collapse to the same string.
     *
     * <p>Messages starting with "Malformed byte sequence: " or
     * "Unmappable byte sequence: " are reduced to a fixed sentence. Otherwise
     * the patterns in {@code PATTERNS} are tried in order and the first one
     * that matches has its first occurrence replaced by the entry at the same
     * index in {@code REPLACEMENTS}. Messages matching nothing are returned
     * unchanged.
     *
     * @param str the raw error message; must not be {@code null}
     * @return the consolidated message
     */
    private static String replaceSpecificValues(String str) {
        if (str.startsWith("Malformed byte sequence: ")) {
            return "Malformed byte sequence.";
        }
        if (str.startsWith("Unmappable byte sequence: ")) {
            return "Unmappable byte sequence.";
        }

        int index = 0;
        for (Pattern pattern : PATTERNS) {
            Matcher matcher = pattern.matcher(str);
            if (matcher.find()) {
                // Only the first pattern that matches is applied.
                return matcher.replaceFirst(REPLACEMENTS[index]);
            }
            index++;
        }
        return str;
    }
    
    /**
     * Source of input lines, one document per line. {@code main} hands the
     * same instance to every worker it starts.
     */
    private final CountingReadLine in;

    /** Destination for the tab-separated report records produced by {@code run}. */
    private final PrintWriter out;

    /** Root of the directory tree in which the gzipped documents are looked up. */
    private final File rootDir;

    /** Parser built once per worker by {@code setupParser} and reused for every document. */
    private final XMLReader parser;

    /**
     * Distinct consolidated messages reported through this object's
     * {@code error}/{@code fatalError} callbacks; cleared before each document.
     */
    private final HashSet<String> parseErrors = new HashSet<String>();

    /**
     * Distinct consolidated messages reported by the validator's error
     * handler; cleared before each document.
     */
    private final HashSet<String> validationErrors = new HashSet<String>();
    
    /**
     * One-letter code set by the {@code documentMode} callback for the
     * current document; reset to {@code null} before each document.
     */
    private String documentMode = null;

    /** Schemas from which a fresh validator is created for each document. */
    private Set<Schema> schemas;

    /**
     * Creates a worker and builds its parser.
     *
     * @param in source of input lines; read one line at a time by {@code run}
     * @param out destination for the report records
     * @param rootDir root directory under which the gzipped documents are stored
     * @param schemas schemas used to create a validator for each document
     */
    public ValidationWorker(CountingReadLine in, PrintWriter out, File rootDir,
            Set<Schema> schemas) {
        this.in = in;
        this.out = out;
        this.rootDir = rootDir;
        this.schemas = schemas;
        // setupParser registers this object as the parser's error handler and
        // document mode handler, so it is called last.
        this.parser = setupParser();
    }

    /**
     * Builds a validator covering every schema in the given set.
     *
     * <p>Each schema contributes one validator; they are chained pairwise
     * with {@code CombineValidator}. All of them report to a handler that
     * stores the consolidated message of every error and fatal error in
     * {@code validationErrors} and discards warnings.
     *
     * @param schemas the schemas to validate against
     * @return the combined validator, or {@code null} if {@code schemas} is empty
     */
    private Validator setupValidator(Set<Schema> schemas) {
        ErrorHandler collector = new ErrorHandler() {

            public void warning(SAXParseException exception)
                    throws SAXException {
                // Warnings are not recorded.
            }

            public void error(SAXParseException exception) throws SAXException {
                String message = exception.getMessage();
                validationErrors.add(replaceSpecificValues(message));
            }

            public void fatalError(SAXParseException exception)
                    throws SAXException {
                // Not expected to happen; recorded like an ordinary error if it does.
                String message = exception.getMessage();
                validationErrors.add(replaceSpecificValues(message));
            }
        };

        PropertyMapBuilder properties = new PropertyMapBuilder();
        properties.put(ValidateProperty.ERROR_HANDLER, collector);
        PropertyMap validatorProperties = properties.toPropertyMap();

        Validator combined = null;
        for (Schema schema : schemas) {
            Validator next = schema.createValidator(validatorProperties);
            combined = (combined == null) ? next : new CombineValidator(
                    combined, next);
        }
        return combined;
    }

    /**
     * Creates and configures the HTML parser used for every document handled
     * by this worker.
     *
     * <p>This object is registered both as the parser's document mode handler
     * and as the error handler of the returned reader, so parse errors and
     * the document mode are delivered to this worker's callbacks.
     *
     * @return the configured parser wrapped in an
     *         {@code AttributesPermutingXMLReaderWrapper}
     */
    private XMLReader setupParser() {
        HtmlParser html = new HtmlParser();

        // Policies for constructs that are not representable in XML.
        html.setCommentPolicy(XmlViolationPolicy.ALLOW);
        html.setContentNonXmlCharPolicy(XmlViolationPolicy.ALLOW);
        html.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
        html.setNamePolicy(XmlViolationPolicy.ALLOW);
        html.setStreamabilityViolationPolicy(XmlViolationPolicy.ALLOW);
        html.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);

        // Remaining parser options.
        html.setMappingLangToXmlLang(true);
        html.setHeuristics(Heuristics.ALL);
        html.setDoctypeExpectation(DoctypeExpectation.NO_DOCTYPE_ERRORS);
        html.setCheckingNormalization(true);

        // Route the document mode and parse errors back to this worker.
        html.setDocumentModeHandler(this);
        XMLReader reader = new AttributesPermutingXMLReaderWrapper(html);
        reader.setErrorHandler(this);
        return reader;
    }

    /**
     * Processes input lines until the input is exhausted.
     *
     * <p>Each input line has the form {@code md5 TAB url TAB charset}. The
     * document is read from {@code rootDir/<md5[0..2]>/<md5[2..4]>/<md5>.gz},
     * parsed with {@code url} as its system id and validated against the
     * configured schemas. A charset of the literal string {@code "null"}
     * means no encoding is set on the input source.
     *
     * <p>For every document one report (see {@code buildReport}) is written
     * to {@code out} with a single {@code println} call, presumably so that
     * records from workers sharing the same writer are not interleaved.
     *
     * <p>A failure while handling one line is reported on {@code System.err}
     * and processing continues with the next line. Lines that do not have
     * two tabs and an md5 of at least four characters are skipped with a
     * message instead of a stack trace.
     */
    public void run() {
        for (;;) {
            String url = null;
            try {
                String inLine = in.readLine();
                if (inLine == null) {
                    return;
                }

                parseErrors.clear();
                validationErrors.clear();
                documentMode = null;

                int firstTab = inLine.indexOf('\t');
                int secondTab = inLine.indexOf('\t', firstTab + 1);
                // The md5 field needs at least four characters because its
                // first two pairs of characters name the directories.
                if (firstTab < 4 || secondTab < 0) {
                    System.err.println("Skipping malformed input line: "
                            + inLine);
                    continue;
                }
                String md5 = inLine.substring(0, firstTab);
                url = inLine.substring(firstTab + 1, secondTab);
                String charset = inLine.substring(secondTab + 1);

                // A fresh validator per document; its errors go to validationErrors.
                Validator validator = setupValidator(schemas);
                parser.setContentHandler(validator.getContentHandler());

                File top = new File(rootDir, md5.substring(0, 2));
                File second = new File(top, md5.substring(2, 4));
                File inFile = new File(second, md5 + ".gz");

                // Close both streams explicitly so that file descriptors and
                // inflater resources are released for every document, even
                // when the gzip header is bad or parsing fails.
                FileInputStream fileStream = new FileInputStream(inFile);
                try {
                    GZIPInputStream gzipStream = new GZIPInputStream(
                            fileStream);
                    try {
                        InputSource is = new InputSource();
                        is.setByteStream(gzipStream);
                        is.setSystemId(url);
                        if (!"null".equals(charset)) {
                            is.setEncoding(charset);
                        }
                        parser.parse(is);
                    } finally {
                        gzipStream.close();
                    }
                } finally {
                    fileStream.close();
                }

                out.println(buildReport(url));
            } catch (Throwable t) {
                // Worker boundary: report the failing document and carry on
                // with the next input line.
                System.err.println(url);
                t.printStackTrace();
            }
        }
    }

    /**
     * Builds the newline-separated report for the document just processed.
     *
     * <p>The report contains, in this order: a "NEITHER ERRORS" record when
     * both error sets are empty; either a "NO PARSE ERRORS" record or one
     * "P" record per parse error; either a "NO VALIDATION ERRORS" record or
     * one "V" record per validation error.
     *
     * @param url the URL of the document, used as the first field of each record
     * @return the report without a trailing line terminator
     */
    private String buildReport(String url) {
        StringBuilder sb = new StringBuilder();

        if (parseErrors.isEmpty() && validationErrors.isEmpty()) {
            // This summary record is tagged "P", as it always has been.
            appendRecord(sb, url, 'P', "NEITHER ERRORS");
        }

        if (parseErrors.isEmpty()) {
            appendRecord(sb, url, 'P', "NO PARSE ERRORS");
        } else {
            for (String error : parseErrors) {
                appendRecord(sb, url, 'P', sanitize(error));
            }
        }

        if (validationErrors.isEmpty()) {
            appendRecord(sb, url, 'V', "NO VALIDATION ERRORS");
        } else {
            for (String error : validationErrors) {
                appendRecord(sb, url, 'V', sanitize(error));
            }
        }
        return sb.toString();
    }

    /**
     * Appends one {@code url TAB documentMode TAB kind TAB message} record,
     * preceded by a newline unless it is the first record in the buffer.
     */
    private void appendRecord(StringBuilder sb, String url, char kind,
            String message) {
        if (sb.length() > 0) {
            sb.append('\n');
        }
        sb.append(url).append('\t').append(documentMode).append('\t')
                .append(kind).append('\t').append(message);
    }

    /**
     * Matches the characters that would break the tab-separated,
     * one-record-per-line report format. Compiled once instead of on every
     * call.
     */
    private static final Pattern RECORD_BREAKING_CHARS = Pattern.compile("[\t\r\n]");

    /**
     * Makes an error message safe to embed as a field of a report record by
     * replacing every tab, carriage return and line feed with a space.
     *
     * @param error the message to clean; must not be {@code null}
     * @return the message with each tab, CR and LF replaced by a single space
     */
    private String sanitize(String error) {
        return RECORD_BREAKING_CHARS.matcher(error).replaceAll(" ");
    }

    /**
     * Command-line entry point: validates every document listed in the input
     * file using four worker threads and writes the gzipped report.
     *
     * <p>Arguments:
     * <ol>
     * <li>path of the UTF-8 input list (one document per line)</li>
     * <li>path of the gzipped UTF-8 report file to create</li>
     * <li>root directory of the stored documents</li>
     * <li>path of the schema file, read with the compact schema reader</li>
     * </ol>
     *
     * <p>The method waits for all workers to finish and then closes the
     * report, which is what writes the gzip trailer; without that the output
     * file would be left incomplete.
     *
     * @param args the four arguments described above
     * @throws IllegalArgumentException if fewer than four arguments are given
     * @throws Exception if a file cannot be opened, the schema cannot be
     *         loaded, or the wait for the workers is interrupted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 4) {
            throw new IllegalArgumentException(
                    "Expected 4 arguments: <input-list> <output.gz> <root-dir> <schema>");
        }

        BufferedReader in = new BufferedReader(new InputStreamReader(
                new FileInputStream(args[0]), "utf-8"));
        try {
            PrintWriter out = new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(
                    new FileOutputStream(args[1])), "utf-8"), true);
            try {
                File rootDir = new File(args[2]);

                Set<Schema> schemas = new HashSet<Schema>();
                schemas.add(CheckerSchema.ASSERTION_SCH);
                schemas.add(CheckerSchema.NORMALIZATION_CHECKER);
                schemas.add(CheckerSchema.TABLE_CHECKER);
                schemas.add(CheckerSchema.TEXT_CONTENT_CHECKER);
                schemas.add(CheckerSchema.USEMAP_CHECKER);

                // toURI().toURL() escapes characters that are illegal in URLs,
                // which the deprecated File.toURL() does not.
                InputSource is = new InputSource((new File(args[3])).toURI().toURL().toExternalForm());
                SchemaReader sr = CompactSchemaReader.getInstance();
                schemas.add(new XmlLangAttributeDroppingSchemaWrapper(new DataAttributeDroppingSchemaWrapper(sr.createSchema(is, PropertyMap.EMPTY))));

                CountingReadLine countingReadLine = new CountingReadLine(in);

                Thread[] workers = new Thread[4];
                for (int i = 0; i < workers.length; i++) {
                    workers[i] = new Thread(new ValidationWorker(countingReadLine, out, rootDir, schemas));
                    workers[i].start();
                }
                // Wait for every worker so the report is complete before it is closed.
                for (Thread worker : workers) {
                    worker.join();
                }
            } finally {
                // Closing finishes the gzip stream and flushes buffered data.
                out.close();
                // PrintWriter swallows I/O errors; surface them here.
                if (out.checkError()) {
                    System.err.println("An I/O error occurred while writing " + args[1]);
                }
            }
        } finally {
            in.close();
        }
    }

    /**
     * Receives a parse error from the parser and records its consolidated
     * message in {@code parseErrors}.
     *
     * @param exception the reported error
     * @throws SAXException declared by the {@code ErrorHandler} interface
     */
    public void error(SAXParseException exception) throws SAXException {
        String consolidated = replaceSpecificValues(exception.getMessage());
        parseErrors.add(consolidated);
    }

    /**
     * Receives a fatal parse error and records its consolidated message in
     * {@code parseErrors}, exactly like an ordinary error.
     *
     * @param exception the reported fatal error
     * @throws SAXException declared by the {@code ErrorHandler} interface
     */
    public void fatalError(SAXParseException exception) throws SAXException {
        // Not expected to be reached; recorded rather than dropped if it is.
        String consolidated = replaceSpecificValues(exception.getMessage());
        parseErrors.add(consolidated);
    }

    /**
     * Receives a parser warning and discards it; only errors and fatal errors
     * are recorded.
     *
     * @param exception the reported warning (unused)
     * @throws SAXException declared by the {@code ErrorHandler} interface
     */
    public void warning(SAXParseException exception) throws SAXException {
        // Intentionally empty.

    }

    /**
     * Callback from the parser announcing the document mode; stores it as a
     * one-letter code that becomes the second field of each report record.
     *
     * <ul>
     * <li>{@code "Q"} for quirks mode</li>
     * <li>{@code "A"} for almost standards mode</li>
     * <li>{@code "H"} for standards mode when both identifiers are
     * {@code null} (presumably the bare HTML5 doctype; confirm against the
     * parser)</li>
     * <li>{@code "S"} for standards mode otherwise</li>
     * </ul>
     *
     * Any other mode leaves the stored code untouched.
     *
     * @param mode the mode chosen by the parser
     * @param publicIdentifier the doctype public identifier, or {@code null}
     * @param systemIdentifier the doctype system identifier, or {@code null}
     * @param html4SpecificAdditionalErrorChecks unused
     * @throws SAXException declared by the {@code DocumentModeHandler} interface
     */
    public void documentMode(DocumentMode mode, String publicIdentifier,
            String systemIdentifier, boolean html4SpecificAdditionalErrorChecks)
            throws SAXException {
        switch (mode) {
            case STANDARDS_MODE:
                documentMode = (publicIdentifier != null || systemIdentifier != null)
                        ? "S"
                        : "H";
                break;
            case QUIRKS_MODE:
                documentMode = "Q";
                break;
            case ALMOST_STANDARDS_MODE:
                documentMode = "A";
                break;
            default:
                // Leave the current value unchanged.
                break;
        }
    }
    
}