Java Examples for nu.validator.htmlparser.sax.HtmlParser

The following Java examples will help you understand the usage of nu.validator.htmlparser.sax.HtmlParser. These source code samples are taken from different open-source projects.

Example 1
Project: htmlparser-master  File: TreeTester.java View source code
/**
 * Runs a single test read from {@code aggregateStream}: parses the test's data section with
 * an {@link HtmlParser}, dumps the resulting tree and compares the dump with the expected
 * document section that follows in the stream.
 *
 * <p>The method makes two passes over the test. The first pass (after {@code mark}) only
 * scans ahead to discover an optional fragment context and an optional scripting directive;
 * the stream is then {@code reset} and the second pass does the actual parsing and comparison.
 * The outcome ("Success." or a failure report) is written to {@code System.err}.
 *
 * @return {@code false} when {@code skipLabel()} returns {@code true} at any point (reported
 *         as premature end of test data, except at the very first label); {@code true}
 *         otherwise, including when the comparison fails
 * @throws Throwable anything thrown while running the test, rethrown after the data stream
 *         has been printed to {@code System.err}
 */
private boolean runTest() throws Throwable {
    UntilHashInputStream stream = null;
    try {
        String context = null;
        boolean scriptingEnabled = true;
        boolean hadScriptingDirective = false;
        // Remember the start of this test so it can be re-read after the look-ahead pass.
        // NOTE(review): the look-ahead must stay within 12288 bytes for reset() to work — confirm.
        aggregateStream.mark(12288);
        // presumably skipLabel() returns true when no further label can be read — TODO confirm
        if (skipLabel()) {
            // #data
            return false;
        }
        // Look-ahead pass: consume the data section without using it.
        stream = new UntilHashInputStream(aggregateStream);
        while (stream.read() != -1) {
        // spin
        }
        if (skipLabel()) {
            // #errors
            System.err.println("Premature end of test data.");
            return false;
        }
        // Look-ahead pass: consume the errors section without using it.
        stream = new UntilHashInputStream(aggregateStream);
        while (stream.read() != -1) {
        // spin
        }
        // Read the label that follows the errors section, up to the end of the line.
        // NOTE(review): this loop (and the two below) only stops on '\n'; a read() result of
        // -1 is appended like any other value, so a truncated stream would never terminate.
        StringBuilder sb = new StringBuilder();
        int c;
        while ((c = aggregateStream.read()) != '\n') {
            sb.append((char) c);
        }
        String label = sb.toString();
        if ("document-fragment".equals(label)) {
            // The line after the label is the fragment context.
            sb.setLength(0);
            while ((c = aggregateStream.read()) != '\n') {
                sb.append((char) c);
            }
            context = sb.toString();
            // Now potentially gather #script-on/off
            sb.setLength(0);
            while ((c = aggregateStream.read()) != '\n') {
                sb.append((char) c);
            }
            label = sb.toString();
        }
        if ("script-on".equals(label)) {
            hadScriptingDirective = true;
        } else if ("script-off".equals(label)) {
            hadScriptingDirective = true;
            scriptingEnabled = false;
        }
        // End of look-ahead: rewind to the marked position and process the test for real.
        aggregateStream.reset();
        if (skipLabel()) {
            // #data
            System.err.println("Premature end of test data.");
            return false;
        }
        stream = new UntilHashInputStream(aggregateStream);
        InputSource is = new InputSource(stream);
        is.setEncoding("UTF-8");
        // sw receives the tree dump; leh collects the errors reported by the parser.
        StringWriter sw = new StringWriter();
        ListErrorHandler leh = new ListErrorHandler();
        TreeDumpContentHandler treeDumpContentHandler = new TreeDumpContentHandler(sw);
        HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.ALLOW);
        if (streaming) {
            htmlParser.setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL);
        }
        htmlParser.setContentHandler(treeDumpContentHandler);
        htmlParser.setLexicalHandler(treeDumpContentHandler);
        htmlParser.setErrorHandler(leh);
        htmlParser.setScriptingEnabled(scriptingEnabled);
        try {
            if (context == null) {
                htmlParser.parse(is);
            } else {
                // A context prefixed with "svg " or "math " selects the matching namespace;
                // the prefix is stripped and the remainder is used as the context name.
                String ns = "http://www.w3.org/1999/xhtml";
                if (context.startsWith("svg ")) {
                    ns = "http://www.w3.org/2000/svg";
                    context = context.substring(4);
                } else if (context.startsWith("math ")) {
                    ns = "http://www.w3.org/1998/Math/MathML";
                    context = context.substring(5);
                }
                htmlParser.parseFragment(is, context, ns);
                // endDocument() is invoked explicitly on the dump handler after fragment parsing.
                treeDumpContentHandler.endDocument();
            }
        } catch (SAXParseException e) {
            // Deliberately ignored here.
            // NOTE(review): presumably the error is already recorded through leh — confirm.
        }
        stream.close();
        if (skipLabel()) {
            // #errors
            System.err.println("Premature end of test data.");
            return false;
        }
        // Collect the expected errors, one per line; they are only printed on failure.
        LinkedList<String> expectedErrors = new LinkedList<String>();
        BufferedReader br = new BufferedReader(new InputStreamReader(new UntilHashInputStream(aggregateStream), "UTF-8"));
        String line = null;
        while ((line = br.readLine()) != null) {
            expectedErrors.add(line);
        }
        if (context != null) {
            if (skipLabel()) {
                // #document-fragment
                System.err.println("Premature end of test data.");
                return false;
            }
            // The fragment context was already captured during look-ahead; skip over it.
            UntilHashInputStream stream2 = new UntilHashInputStream(aggregateStream);
            while (stream2.read() != -1) {
            // spin
            }
        }
        if (hadScriptingDirective && skipLabel()) {
            // #script-on/off
            System.err.println("Premature end of test data.");
            return false;
        }
        if (skipLabel()) {
            // #document
            System.err.println("Premature end of test data.");
            return false;
        }
        // Read the expected tree dump character by character.
        StringBuilder expectedBuilder = new StringBuilder();
        br = new BufferedReader(new InputStreamReader(new UntilHashInputStream(aggregateStream), "UTF-8"));
        int ch;
        while ((ch = br.read()) != -1) {
            expectedBuilder.append((char) ch);
        }
        String expected = expectedBuilder.toString();
        String actual = sw.toString();
        LinkedList<String> actualErrors = leh.getErrors();
        // Success means the dumps match, or (in streaming mode) the error handler reports a
        // fatal error. The error-count comparison below is commented out, so the error lists
        // do not influence the verdict.
        if (expected.equals(actual) || (streaming && leh.isFatal())) /*
             * && expectedErrors.size() ==
             * actualErrors.size()
             */
        {
            System.err.println("Success.");
        // System.err.println(stream);
        } else {
            System.err.print("Failure.\nData:\n" + stream + "\nExpected:\n" + expected + "Got: \n" + actual);
            System.err.println("Expected errors:");
            for (String err : expectedErrors) {
                System.err.println(err);
            }
            System.err.println("Actual errors:");
            for (String err : actualErrors) {
                System.err.println(err);
            }
        }
    } catch (Throwable t) {
        // Report which data was being processed, then let the failure propagate.
        System.err.println("Failure.\nData:\n" + stream);
        throw t;
    }
    return true;
}
Example 2
Project: whole-master  File: HtmlPersistenceKit.java View source code
/**
 * Parses the provider's input stream as HTML and converts the result into a model entity.
 *
 * <p>The SAX events produced by the {@link HtmlParser} are consumed by a
 * {@code SaxConsumerHandler} that drives a {@code ModelBuilderOperation}. The built result is
 * normalized and then passed through the {@code HTML5Semantics#toHtml} behavior.
 *
 * @param pp persistence provider supplying the input stream to parse
 * @return the entity returned by the {@code toHtml} behavior
 * @throws Exception if parsing or any of the later steps fails
 */
protected IEntity doReadModel(IPersistenceProvider pp) throws Exception {
    ModelBuilderOperation builderOperation = new ModelBuilderOperation();
    SaxConsumerHandler eventConsumer = new SaxConsumerHandler(builderOperation, false);

    // The same handler receives both content events and lexical events.
    HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.ALLOW);
    htmlParser.setContentHandler(eventConsumer);
    htmlParser.setProperty("http://xml.org/sax/properties/lexical-handler", eventConsumer);

    InputSource htmlSource = new InputSource(pp.getInputStream());
    htmlParser.parse(htmlSource);

    IEntity normalizedDocument = NormalizerOperation.normalize(builderOperation.wGetResult());
    return BehaviorUtils.apply("whole:org.whole.lang.html:HTML5Semantics#toHtml", normalizedDocument);
}
Example 3
Project: zen-project-master  File: HtmlSaxPageViewResolver.java View source code
/**
 * Wraps {@code is} in a {@link Source}.
 *
 * <p>When the {@code html} flag is off the stream is wrapped in a plain {@link StreamSource}.
 * Otherwise a {@link SAXSource} is returned, reading the stream (declared as {@code UTF_8})
 * through an {@code HtmlFragmentParser} wrapped around an {@link HtmlParser}.
 *
 * @param is the stream to read from
 * @return a source over {@code is}
 */
private Source saxSource(InputStream is) {
    if (!html) {
        return new StreamSource(is);
    }

    HtmlParser htmlParser = new HtmlParser();
    htmlParser.setMappingLangToXmlLang(true);
    htmlParser.setReportingDoctype(false);

    InputSource utf8Input = new InputSource(is);
    utf8Input.setEncoding(UTF_8);

    return new SAXSource(new HtmlFragmentParser(htmlParser), utf8Input);
}
Example 4
Project: wicket-stuff-markup-validator-master  File: DocType.java View source code
/**
 * Builds an {@link HtmlParser} with this class's fixed policy settings and the given doctype
 * expectation.
 *
 * @param docTypeExpectation the doctype expectation to install on the parser
 * @return the configured parser
 */
private static HtmlParser createHtmlParser(DoctypeExpectation docTypeExpectation) {
    final HtmlParser parser = new HtmlParser();

    // XML-violation policies.
    parser.setCommentPolicy(XmlViolationPolicy.ALLOW);
    parser.setContentNonXmlCharPolicy(XmlViolationPolicy.ALLOW);
    parser.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
    parser.setNamePolicy(XmlViolationPolicy.ALLOW);
    parser.setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL);
    parser.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);

    // Remaining switches.
    parser.setMappingLangToXmlLang(true);
    parser.setHtml4ModeCompatibleWithXhtml1Schemata(true);
    parser.setHeuristics(Heuristics.ALL);
    parser.setDoctypeExpectation(docTypeExpectation);

    return parser;
}
Example 5
Project: wala-mirror-master  File: NuValidatorHtmlParser.java View source code
/**
 * Feeds the characters of {@code reader} to a validator.nu {@link HtmlParser} and relays the
 * resulting SAX events to {@code handler} as start tags, end tags and text.
 *
 * <p>Carriage returns are dropped from the input before it reaches the parser. An
 * {@link IOException} or {@link SAXException} raised while parsing only trips an
 * {@code assert}, so it goes unnoticed when assertions are disabled.
 *
 * @param url passed to every {@code LineNumberPosition} created for a tag or a text run
 * @param reader source of the HTML text
 * @param handler callback receiving the tags and text
 * @param fileName used to build a {@code file://} URL that accompanies {@code url} in every
 *        position; if that URL is malformed the stack trace is printed and {@code null} is
 *        used instead
 */
@Override
public void parse(final URL url, final Reader reader, final IHtmlCallback handler, final String fileName) {
    URL fileUrl = null;
    try {
        fileUrl = new URL("file://" + fileName);
    } catch (MalformedURLException malformed) {
        malformed.printStackTrace();
    }
    final URL localFileName = fileUrl;

    HtmlParser htmlParser = new HtmlParser();
    htmlParser.setXmlPolicy(XmlViolationPolicy.ALLOW);
    htmlParser.setContentHandler(new ContentHandler() {

        /** Locator supplied by the parser; queried for the current line number. */
        private Locator documentLocator;

        /** Tags that have been started but not yet ended, innermost on top. */
        private Stack<ITag> openTags = new Stack<ITag>();

        /** Counts the line terminators in the given character range. */
        private int countLines(char[] ch, int start, int length) {
            LineNumberReader lineCounter = new LineNumberReader(new StringReader(new String(ch, start, length)));
            try {
                int next = lineCounter.read();
                while (next > -1) {
                    next = lineCounter.read();
                }
            } catch (IOException e) {
                throw new RuntimeException("cannot read from string", e);
            }
            return lineCounter.getLineNumber();
        }

        @Override
        public void setDocumentLocator(Locator locator) {
            documentLocator = locator;
        }

        @Override
        public void startElement(String uri, final String localName, String qName, final Attributes atts) throws SAXException {
            // One position, taken when the element starts, is shared by the tag itself,
            // its content and all of its attributes.
            final Position tagPosition = new LineNumberPosition(url, localFileName, documentLocator.getLineNumber());

            // NOTE(review): the tag reads from atts lazily; whether the parser keeps that
            // object valid after startElement returns is not visible here — confirm.
            final ITag tag = new ITag() {

                @Override
                public String getName() {
                    return localName;
                }

                @Override
                public Pair<String, Position> getAttributeByName(String name) {
                    final String value = atts.getValue(name);
                    return value == null ? null : Pair.make(value, tagPosition);
                }

                @Override
                public Map<String, Pair<String, Position>> getAllAttributes() {
                    return new AbstractMap<String, Pair<String, Position>>() {

                        // Built on the first entrySet() call and reused afterwards.
                        private Set<Map.Entry<String, Pair<String, Position>>> entries = null;

                        @Override
                        public Set<java.util.Map.Entry<String, Pair<String, Position>>> entrySet() {
                            if (entries != null) {
                                return entries;
                            }
                            entries = new HashSet<Map.Entry<String, Pair<String, Position>>>();
                            for (int i = 0; i < atts.getLength(); i++) {
                                final int index = i;
                                entries.add(new Map.Entry<String, Pair<String, Position>>() {

                                    @Override
                                    public String getKey() {
                                        return atts.getLocalName(index).toLowerCase();
                                    }

                                    @Override
                                    public Pair<String, Position> getValue() {
                                        final String value = atts.getValue(index);
                                        return value == null ? null : Pair.make(value, tagPosition);
                                    }

                                    @Override
                                    public Pair<String, Position> setValue(Pair<String, Position> value) {
                                        throw new UnsupportedOperationException();
                                    }
                                });
                            }
                            return entries;
                        }
                    };
                }

                @Override
                public Position getElementPosition() {
                    return tagPosition;
                }

                @Override
                public Position getContentPosition() {
                    return tagPosition;
                }
            };

            openTags.push(tag);
            handler.handleStartTag(tag);
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            final ITag closedTag = openTags.pop();
            handler.handleEndTag(closedTag);
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // The reported line is the locator's current line minus the number of line
            // breaks inside the text run.
            final int reportedLine = documentLocator.getLineNumber() - countLines(ch, start, length);
            final Position textPosition = new LineNumberPosition(url, localFileName, reportedLine);
            handler.handleText(textPosition, new String(ch, start, length));
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            final Position whitespacePosition = new LineNumberPosition(url, localFileName, documentLocator.getLineNumber());
            handler.handleText(whitespacePosition, new String(ch, start, length));
        }

        @Override
        public void startDocument() throws SAXException {
            // nothing to do
        }

        @Override
        public void endDocument() throws SAXException {
            // nothing to do
        }

        @Override
        public void startPrefixMapping(String prefix, String uri) throws SAXException {
            // nothing to do
        }

        @Override
        public void endPrefixMapping(String prefix) throws SAXException {
            // nothing to do
        }

        @Override
        public void processingInstruction(String target, String data) throws SAXException {
            // nothing to do
        }

        @Override
        public void skippedEntity(String name) throws SAXException {
            // nothing to do
        }
    });

    // Adapter exposing the reader as an InputStream with every '\r' filtered out.
    // NOTE(review): each value from reader.read() is handed on unchanged as the stream's
    // "byte"; how values above 255 are treated downstream is not visible here — confirm.
    final InputStream withoutCarriageReturns = new InputStream() {

        @Override
        public int read() throws IOException {
            int next = reader.read();
            while (next == '\r') {
                next = reader.read();
            }
            return next;
        }
    };

    try {
        htmlParser.parse(new InputSource(withoutCarriageReturns));
    } catch (IOException e) {
        assert false : e.toString();
    } catch (SAXException e) {
        assert false : e.toString();
    }
}
Example 6
Project: validator-master  File: VerifierServletTransaction.java View source code
/**
 * Replaces {@code htmlParser} with a newly constructed {@link HtmlParser} and applies this
 * class's fixed policy settings to it.
 */
protected void newHtmlParser() {
    htmlParser = new HtmlParser();
    // Configure through a local alias of the instance that was just stored.
    final HtmlParser parser = htmlParser;

    // XML-violation policies.
    parser.setCommentPolicy(XmlViolationPolicy.ALLOW);
    parser.setContentNonXmlCharPolicy(XmlViolationPolicy.ALLOW);
    parser.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
    parser.setNamePolicy(XmlViolationPolicy.ALLOW);
    parser.setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL);
    parser.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);

    // Remaining switches.
    parser.setMappingLangToXmlLang(true);
    parser.setHtml4ModeCompatibleWithXhtml1Schemata(true);
    parser.setHeuristics(Heuristics.ALL);
}
Example 7
Project: DataCleaner-master  File: MainTest.java View source code
/**
 * Runs the command line with HTML output written to a file, then checks the file.
 *
 * <p>The file must exist, its second line must be {@code <html>}, and parsing it with an
 * {@link HtmlParser} must yield between 3000 and 5000 start elements (exclusive). Any warning
 * or error other than the two known "character encoding not declared" messages fails the
 * test; a fatal error is rethrown by the error handler.
 *
 * @throws Throwable if the command line, the file access or the parse fails
 */
public void testWriteHtmlToFile() throws Throwable {
    final String filename = "target/test_write_html_to_file.html";
    final String[] arguments = ("-conf src/test/resources/cli-examples/conf.xml -job src/test/resources/cli-examples/employees_job.xml -of " + filename + " -ot HTML").split(" ");
    Main.main(arguments);

    final File file = new File(filename);
    assertTrue(file.exists());

    final String[] outputLines = FileHelper.readFileAsString(file).split("\n");
    assertEquals("<html>", outputLines[1]);

    try (InputStream in = FileHelper.getInputStream(file)) {
        // parse it with validator.nu for HTML correctness
        final HtmlParser parser = new HtmlParser(XmlViolationPolicy.FATAL);

        final AtomicInteger startedElements = new AtomicInteger();
        parser.setContentHandler(new DefaultHandler() {

            @Override
            public void startElement(final String uri, final String localName, final String qName, final Attributes attributes) throws SAXException {
                startedElements.incrementAndGet();
            }
        });

        // Warnings and errors are logged and collected; fatal errors abort the parse.
        final List<Exception> warningsAndErrors = new ArrayList<>();
        parser.setErrorHandler(new ErrorHandler() {

            @Override
            public void warning(final SAXParseException exception) throws SAXException {
                System.err.println("Warning: " + exception.getMessage());
                warningsAndErrors.add(exception);
            }

            @Override
            public void error(final SAXParseException exception) throws SAXException {
                System.err.println("Error: " + exception.getMessage());
                warningsAndErrors.add(exception);
            }

            @Override
            public void fatalError(final SAXParseException exception) throws SAXException {
                System.out.println("Fatal error: " + exception.getMessage());
                throw exception;
            }
        });

        parser.parse(new InputSource(in));

        // the output has approx 3600 XML elements
        final int elementCount = startedElements.get();
        assertTrue("Element count: " + elementCount, elementCount > 3000);
        assertTrue("Element count: " + elementCount, elementCount < 5000);

        for (final Exception problem : warningsAndErrors) {
            final String message = problem.getMessage();
            // Missing-encoding-declaration messages are accepted.
            final boolean encodingNotice = message.startsWith("No explicit character encoding declaration has been seen yet")
                    || message.startsWith("The character encoding of the document was not declared.");
            if (!encodingNotice) {
                problem.printStackTrace();
                fail("Got " + warningsAndErrors.size() + " warnings and errors, see log for details");
            }
        }
    }
}
Example 8
Project: lux-master  File: XQueryComponent.java View source code
/**
 * Converts every content stream of the request into an {@link XdmItem} and records a matching
 * body descriptor in {@code builder}.
 *
 * <p>For each stream the bytes are read fully, then:
 * <ul>
 * <li>XML content, and any content that is not text, is built as a document through
 * {@code evaluator}; if that fails the part is treated as plain text;</li>
 * <li>HTML content is parsed with an {@link HtmlParser}; if that fails the part is treated as
 * plain text;</li>
 * <li>everything else becomes a text fragment.</li>
 * </ul>
 * The item is appended to {@code result}, and an empty {@code http:body} element carrying
 * {@code position} and {@code content-type} attributes is written to {@code builder}.
 *
 * <p>Parts are identified as {@code #part0}, {@code #part1}, ... in iteration order.
 *
 * @param builder receives one {@code http:body} element per part
 * @param req request whose content streams are processed
 * @param result list that the resulting items are appended to
 * @param evaluator used to build XML and HTML documents
 * @throws XPathException if writing to {@code builder} fails
 * @throws LuxException if a stream cannot be read or names an unsupported charset
 */
private void handleContentStreams(LinkedTreeBuilder builder, SolrQueryRequest req, ArrayList<XdmItem> result, Evaluator evaluator) throws XPathException {
    // Index of the current part; gives every part a distinct "#partN" identifier.
    int i = 0;
    for (ContentStream stream : req.getContentStreams()) {
        String contentType = stream.getContentType();
        byte[] partBytes = null;
        try {
            // The stream is opened here, so it is closed here once fully read.
            java.io.InputStream in = stream.getStream();
            try {
                partBytes = IOUtils.toByteArray(in, stream.getSize());
            } finally {
                in.close();
            }
        } catch (IOException e) {
            throw new LuxException(e);
        }
        String charset = ContentStreamBase.getCharsetFromContentType(contentType);
        if (charset == null) {
            charset = "utf-8";
        }
        if (!isText(contentType)) {
            logger.warn("Binary values not supported; treating " + contentType + " as xml, or text");
        }
        XdmItem part = null;
        if (isXML(contentType) || !isText(contentType)) {
            try {
                part = evaluator.build(new ByteArrayInputStream(partBytes), "#part" + i);
            } catch (LuxException e) {
                logger.warn("Caught an exception while parsing XML: " + e.getMessage() + ", treating it as plain text");
                // Downgrade the advertised type so the descriptor matches what is stored.
                contentType = "text/plain; charset=" + charset;
            }
        }
        if (part == null) {
            String text;
            try {
                text = new String(partBytes, charset);
            } catch (UnsupportedEncodingException e1) {
                throw new LuxException(e1);
            }
            if (isHTML(contentType)) {
                HtmlParser parser = new HtmlParser();
                SAXSource source = new SAXSource(parser, new InputSource(new StringReader(text)));
                try {
                    part = evaluator.getDocBuilder().build(source);
                } catch (SaxonApiException e) {
                    // Hand the cause to the logger instead of printing it to stderr.
                    logger.warn("failed to parse HTML; treating as plain text: " + e.getMessage(), e);
                }
            }
            if (part == null) {
                TextFragmentValue node = new TextFragmentValue(text, "#part" + i);
                node.setConfiguration(builder.getConfiguration());
                part = new XdmNode(node);
            }
        }
        result.add(part);
        builder.startElement(fQNameFor("http", EXPATH_HTTP_NS, "body"), BuiltInAtomicType.UNTYPED_ATOMIC, 0, 0);
        // NOTE(review): position is "1" for every part; left unchanged because it is
        // visible output — confirm whether it should follow the part index.
        addAttribute(builder, "position", "1");
        addAttribute(builder, "content-type", contentType);
        builder.startContent();
        builder.endElement();
        i++;
    }
}
Example 9
Project: fcrepo4-master  File: FedoraLdpIT.java View source code
/**
 * Requests the HTML view of {@code path} and parses the response body with an
 * {@link HtmlParser}.
 *
 * <p>The response status must equal {@code OK}. Parse problems are handed to an
 * {@code HTMLErrorHandler}.
 *
 * @param path the path handed to {@code getObjMethod}
 * @throws IOException if the request or reading the response fails
 * @throws SAXException if the parser or its error handler throws one
 */
private static void validateHTML(final String path) throws IOException, SAXException {
    final HttpGet htmlRequest = getObjMethod(path);
    htmlRequest.addHeader(ACCEPT, "text/html");

    try (final CloseableHttpResponse response = execute(htmlRequest)) {
        assertEquals(OK.getStatusCode(), getStatus(response));

        final String html = EntityUtils.toString(response.getEntity());
        logger.trace("Retrieved HTML view:\n" + html);

        final HtmlParser parser = new HtmlParser(ALLOW);
        parser.setDoctypeExpectation(NO_DOCTYPE_ERRORS);
        parser.setErrorHandler(new HTMLErrorHandler());
        parser.setContentHandler(new TreeBuilder());

        // Parse from the bytes of the text that was already read.
        final byte[] htmlBytes = html.getBytes(UTF_8);
        try (final InputStream htmlStream = new ByteArrayInputStream(htmlBytes)) {
            final InputSource htmlSource = new InputSource(htmlStream);
            parser.parse(htmlSource);
        }
        logger.debug("HTML found to be valid.");
    }
}
Example 10
Project: AnalyzerBeans-master  File: MainTest.java View source code
/**
 * Runs the command line with HTML output written to a file, then checks the file.
 *
 * <p>The file must exist, its second line must be {@code <html>}, and parsing it with an
 * {@link HtmlParser} must yield between 3000 and 5000 start elements (exclusive). Any warning
 * or error other than the two known "character encoding not declared" messages fails the
 * test; a fatal error is rethrown by the error handler. The input stream is closed in all
 * cases.
 *
 * @throws Throwable if the command line, the file access or the parse fails
 */
public void testWriteHtmlToFile() throws Throwable {
    String filename = "target/test_write_html_to_file.html";
    String[] arguments = ("-conf examples/conf.xml -job examples/employees_job.xml -of " + filename + " -ot HTML").split(" ");
    Main.main(arguments);

    File file = new File(filename);
    assertTrue(file.exists());

    String[] outputLines = FileHelper.readFileAsString(file).split("\n");
    assertEquals("<html>", outputLines[1]);

    InputStream in = FileHelper.getInputStream(file);
    try {
        // parse it with validator.nu for HTML correctness
        final HtmlParser parser = new HtmlParser(XmlViolationPolicy.FATAL);

        final AtomicInteger startedElements = new AtomicInteger();
        parser.setContentHandler(new DefaultHandler() {

            @Override
            public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
                startedElements.incrementAndGet();
            }
        });

        // Warnings and errors are logged and collected; fatal errors abort the parse.
        final List<Exception> warningsAndErrors = new ArrayList<Exception>();
        parser.setErrorHandler(new ErrorHandler() {

            @Override
            public void warning(SAXParseException exception) throws SAXException {
                System.err.println("Warning: " + exception.getMessage());
                warningsAndErrors.add(exception);
            }

            @Override
            public void error(SAXParseException exception) throws SAXException {
                System.err.println("Error: " + exception.getMessage());
                warningsAndErrors.add(exception);
            }

            @Override
            public void fatalError(SAXParseException exception) throws SAXException {
                System.out.println("Fatal error: " + exception.getMessage());
                throw exception;
            }
        });

        parser.parse(new InputSource(in));

        // the output has approx 3600 XML elements
        int elementCount = startedElements.get();
        assertTrue("Element count: " + elementCount, elementCount > 3000);
        assertTrue("Element count: " + elementCount, elementCount < 5000);

        for (Exception problem : warningsAndErrors) {
            String message = problem.getMessage();
            // Missing-encoding-declaration messages are accepted.
            boolean encodingNotice = message.startsWith("No explicit character encoding declaration has been seen yet")
                    || message.startsWith("The character encoding of the document was not declared.");
            if (!encodingNotice) {
                problem.printStackTrace();
                fail("Got " + warningsAndErrors.size() + " warnings and errors, see log for details");
            }
        }
    } finally {
        in.close();
    }
}