Java Examples for nu.validator.htmlparser.sax.HtmlParser
The following Java examples will help you understand the usage of nu.validator.htmlparser.sax.HtmlParser. These source code samples are taken from different open-source projects.
Example 1
| Project: htmlparser-master File: TreeTester.java View source code |
/**
 * Runs the next test case read from {@code aggregateStream} and prints the outcome to
 * {@code System.err}.
 *
 * <p>The test case is read in two passes. The first pass skips the data and errors sections and
 * reads the following label line(s) to find out whether the case is a fragment test
 * ({@code document-fragment} followed by a context line) and whether it carries a
 * {@code script-on}/{@code script-off} directive. The stream is then reset to the mark and the
 * data section is parsed with an {@link HtmlParser}, after which the expected errors and the
 * expected tree dump are read and compared with the actual tree dump.
 *
 * @return {@code false} when {@code skipLabel()} reports that no (complete) test case could be
 *         read; {@code true} once a test case has been executed, whether it passed or failed
 * @throws Throwable anything thrown while reading or parsing; the data stream is printed to
 *         {@code System.err} before the throwable is rethrown
 */
private boolean runTest() throws Throwable {
    UntilHashInputStream stream = null;
    try {
        String context = null;
        boolean scriptingEnabled = true;
        boolean hadScriptingDirective = false;
        // Mark the start of the test case so the data section can be re-read after the
        // look-ahead pass below. NOTE(review): the look-ahead must stay within 12288 bytes
        // for reset() to work -- confirm against the size of the largest test case.
        aggregateStream.mark(12288);
        if (skipLabel()) {
            // #data
            return false;
        }
        stream = new UntilHashInputStream(aggregateStream);
        while (stream.read() != -1) {
            // spin
        }
        if (skipLabel()) {
            // #errors
            System.err.println("Premature end of test data.");
            return false;
        }
        stream = new UntilHashInputStream(aggregateStream);
        while (stream.read() != -1) {
            // spin
        }
        String label = readAggregateLine();
        if ("document-fragment".equals(label)) {
            context = readAggregateLine();
            // Now potentially gather #script-on/off
            label = readAggregateLine();
        }
        if ("script-on".equals(label)) {
            hadScriptingDirective = true;
        } else if ("script-off".equals(label)) {
            hadScriptingDirective = true;
            scriptingEnabled = false;
        }
        // Second pass: rewind to the mark and actually parse the data section.
        aggregateStream.reset();
        if (skipLabel()) {
            // #data
            System.err.println("Premature end of test data.");
            return false;
        }
        stream = new UntilHashInputStream(aggregateStream);
        InputSource is = new InputSource(stream);
        is.setEncoding("UTF-8");
        StringWriter sw = new StringWriter();
        ListErrorHandler leh = new ListErrorHandler();
        TreeDumpContentHandler treeDumpContentHandler = new TreeDumpContentHandler(sw);
        HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.ALLOW);
        if (streaming) {
            htmlParser.setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL);
        }
        htmlParser.setContentHandler(treeDumpContentHandler);
        htmlParser.setLexicalHandler(treeDumpContentHandler);
        htmlParser.setErrorHandler(leh);
        htmlParser.setScriptingEnabled(scriptingEnabled);
        try {
            if (context == null) {
                htmlParser.parse(is);
            } else {
                // A context of the form "svg name" or "math name" selects the SVG or MathML
                // namespace for the fragment's context element; anything else is XHTML.
                String ns = "http://www.w3.org/1999/xhtml";
                if (context.startsWith("svg ")) {
                    ns = "http://www.w3.org/2000/svg";
                    context = context.substring(4);
                } else if (context.startsWith("math ")) {
                    ns = "http://www.w3.org/1998/Math/MathML";
                    context = context.substring(5);
                }
                htmlParser.parseFragment(is, context, ns);
                treeDumpContentHandler.endDocument();
            }
        } catch (SAXParseException e) {
            // Not rethrown: the comparison below still runs on whatever was dumped so far.
            // NOTE(review): presumably the error is already recorded by leh -- confirm.
        }
        stream.close();
        if (skipLabel()) {
            // #errors
            System.err.println("Premature end of test data.");
            return false;
        }
        LinkedList<String> expectedErrors = new LinkedList<String>();
        BufferedReader br = new BufferedReader(new InputStreamReader(new UntilHashInputStream(aggregateStream), "UTF-8"));
        String line = null;
        while ((line = br.readLine()) != null) {
            expectedErrors.add(line);
        }
        if (context != null) {
            if (skipLabel()) {
                // #document-fragment
                System.err.println("Premature end of test data.");
                return false;
            }
            // The context line was already captured in the first pass; just consume it.
            UntilHashInputStream stream2 = new UntilHashInputStream(aggregateStream);
            while (stream2.read() != -1) {
                // spin
            }
        }
        if (hadScriptingDirective && skipLabel()) {
            // #script-on/off
            System.err.println("Premature end of test data.");
            return false;
        }
        if (skipLabel()) {
            // #document
            System.err.println("Premature end of test data.");
            return false;
        }
        StringBuilder expectedBuilder = new StringBuilder();
        br = new BufferedReader(new InputStreamReader(new UntilHashInputStream(aggregateStream), "UTF-8"));
        int ch;
        while ((ch = br.read()) != -1) {
            expectedBuilder.append((char) ch);
        }
        String expected = expectedBuilder.toString();
        String actual = sw.toString();
        LinkedList<String> actualErrors = leh.getErrors();
        // Only the tree dump is compared; the error counts are printed on failure but are
        // not part of the pass criterion (the check on
        // expectedErrors.size() == actualErrors.size() is intentionally left disabled).
        if (expected.equals(actual) || (streaming && leh.isFatal())) {
            System.err.println("Success.");
            // System.err.println(stream);
        } else {
            System.err.print("Failure.\nData:\n" + stream + "\nExpected:\n" + expected + "Got: \n" + actual);
            System.err.println("Expected errors:");
            for (String err : expectedErrors) {
                System.err.println(err);
            }
            System.err.println("Actual errors:");
            for (String err : actualErrors) {
                System.err.println(err);
            }
        }
    } catch (Throwable t) {
        System.err.println("Failure.\nData:\n" + stream);
        throw t;
    }
    return true;
}

/**
 * Reads characters from {@code aggregateStream} up to, but not including, the next
 * {@code '\n'}. Also stops at end of stream, so truncated test data cannot make the caller
 * loop forever on the -1 that {@code read()} keeps returning after EOF.
 */
private String readAggregateLine() throws java.io.IOException {
    StringBuilder sb = new StringBuilder();
    int c;
    while ((c = aggregateStream.read()) != '\n' && c != -1) {
        sb.append((char) c);
    }
    return sb.toString();
}
Example 2
| Project: whole-master File: HtmlPersistenceKit.java View source code |
/**
 * Parses the HTML supplied by the persistence provider into a model and converts it with the
 * {@code HTML5Semantics#toHtml} behavior.
 *
 * <p>The same {@link SaxConsumerHandler} is registered both as the content handler and as the
 * SAX lexical handler of a lenient ({@code XmlViolationPolicy.ALLOW}) {@link HtmlParser}.
 *
 * @param pp provider whose input stream is parsed
 * @return the result of applying the {@code toHtml} behavior to the normalized parse result
 * @throws Exception whatever the parser or the model operations throw
 */
protected IEntity doReadModel(IPersistenceProvider pp) throws Exception {
    ModelBuilderOperation builderOperation = new ModelBuilderOperation();
    SaxConsumerHandler handler = new SaxConsumerHandler(builderOperation, false);

    HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.ALLOW);
    htmlParser.setContentHandler(handler);
    htmlParser.setProperty("http://xml.org/sax/properties/lexical-handler", handler);

    InputSource source = new InputSource(pp.getInputStream());
    htmlParser.parse(source);

    return BehaviorUtils.apply(
            "whole:org.whole.lang.html:HTML5Semantics#toHtml",
            NormalizerOperation.normalize(builderOperation.wGetResult()));
}
Example 3
| Project: zen-project-master File: HtmlSaxPageViewResolver.java View source code |
/**
 * Wraps the given stream in a {@link Source}.
 *
 * <p>When the {@code html} flag is not set the stream is returned as a plain
 * {@link StreamSource}. Otherwise a {@link SAXSource} is built whose reader is an
 * {@code HtmlFragmentParser} around an {@link HtmlParser} configured to map {@code lang} to
 * {@code xml:lang} and not to report the doctype; the input encoding is set to {@code UTF_8}.
 *
 * @param is the stream to read from
 * @return a source reading from {@code is}
 */
private Source saxSource(InputStream is) {
    if (!html) {
        return new StreamSource(is);
    }

    HtmlParser htmlParser = new HtmlParser();
    htmlParser.setMappingLangToXmlLang(true);
    htmlParser.setReportingDoctype(false);

    InputSource source = new InputSource(is);
    source.setEncoding(UTF_8);

    return new SAXSource(new HtmlFragmentParser(htmlParser), source);
}
Example 4
| Project: wicket-stuff-markup-validator-master File: DocType.java View source code |
/**
 * Builds an {@link HtmlParser} with this class's fixed policy settings and the given doctype
 * expectation.
 *
 * @param docTypeExpectation the doctype expectation to install on the parser
 * @return the configured parser
 */
private static HtmlParser createHtmlParser(DoctypeExpectation docTypeExpectation) {
    HtmlParser parser = new HtmlParser();

    // XML-violation policies.
    parser.setCommentPolicy(XmlViolationPolicy.ALLOW);
    parser.setContentNonXmlCharPolicy(XmlViolationPolicy.ALLOW);
    parser.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
    parser.setNamePolicy(XmlViolationPolicy.ALLOW);
    parser.setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL);
    parser.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);

    // Remaining parser options.
    parser.setMappingLangToXmlLang(true);
    parser.setHtml4ModeCompatibleWithXhtml1Schemata(true);
    parser.setHeuristics(Heuristics.ALL);
    parser.setDoctypeExpectation(docTypeExpectation);

    return parser;
}
Example 5
| Project: wala-mirror-master File: NuValidatorHtmlParser.java View source code |
/**
 * Parses HTML from {@code reader} with an {@link HtmlParser} and forwards start tags, end tags
 * and text to {@code handler}.
 *
 * <p>Carriage returns are dropped from the input before it reaches the parser. Positions
 * handed to the callback are {@code LineNumberPosition}s built from {@code url}, a
 * {@code file://} URL derived from {@code fileName}, and the locator's current line number.
 *
 * @param url      URL passed through to every {@code LineNumberPosition}
 * @param reader   character source of the HTML; it is not closed by this method
 * @param handler  callback receiving tags and text
 * @param fileName local file name; if {@code "file://" + fileName} is not a valid URL the
 *                 stack trace is printed and {@code null} is used instead
 */
@Override
public void parse(final URL url, final Reader reader, final IHtmlCallback handler, final String fileName) {
    URL xx = null;
    try {
        xx = new URL("file://" + fileName);
    } catch (MalformedURLException e1) {
        e1.printStackTrace();
    }
    final URL localFileName = xx;
    HtmlParser parser = new HtmlParser();
    parser.setXmlPolicy(XmlViolationPolicy.ALLOW);
    parser.setContentHandler(new ContentHandler() {
        private Locator locator;
        // Tags that have been opened but not yet closed; popped again in endElement.
        private Stack<ITag> tags = new Stack<ITag>();

        /** Counts the line terminators in the given slice of {@code ch}. */
        private int countLines(char[] ch, int start, int length) {
            LineNumberReader r = new LineNumberReader(new StringReader(new String(ch, start, length)));
            try {
                while (r.read() > -1) {
                    // just advance the line counter
                }
            } catch (IOException e) {
                throw new RuntimeException("cannot read from string", e);
            }
            return r.getLineNumber();
        }

        @Override
        public void setDocumentLocator(Locator locator) {
            this.locator = locator;
        }

        @Override
        public void startElement(String uri, final String localName, String qName, Attributes attributes) throws SAXException {
            // SAX only guarantees the Attributes object for the duration of this call, but
            // the tag built below is kept on the stack and handed out again in endElement,
            // so take a private copy.
            final Attributes atts = new org.xml.sax.helpers.AttributesImpl(attributes);
            final Position line = new LineNumberPosition(url, localFileName, locator.getLineNumber());
            tags.push(new ITag() {
                @Override
                public String getName() {
                    return localName;
                }

                @Override
                public Pair<String, Position> getAttributeByName(String name) {
                    if (atts.getValue(name) != null) {
                        return Pair.make(atts.getValue(name), line);
                    } else {
                        return null;
                    }
                }

                @Override
                public Map<String, Pair<String, Position>> getAllAttributes() {
                    return new AbstractMap<String, Pair<String, Position>>() {
                        // Built lazily on the first entrySet() call.
                        private Set<Map.Entry<String, Pair<String, Position>>> es = null;

                        @Override
                        public Set<java.util.Map.Entry<String, Pair<String, Position>>> entrySet() {
                            if (es == null) {
                                es = new HashSet<Map.Entry<String, Pair<String, Position>>>();
                                for (int i = 0; i < atts.getLength(); i++) {
                                    final int index = i;
                                    es.add(new Map.Entry<String, Pair<String, Position>>() {
                                        @Override
                                        public String getKey() {
                                            // Locale.ROOT keeps the lower-casing independent of
                                            // the default locale (e.g. Turkish dotless i).
                                            return atts.getLocalName(index).toLowerCase(java.util.Locale.ROOT);
                                        }

                                        @Override
                                        public Pair<String, Position> getValue() {
                                            if (atts.getValue(index) != null) {
                                                return Pair.make(atts.getValue(index), line);
                                            } else {
                                                return null;
                                            }
                                        }

                                        @Override
                                        public Pair<String, Position> setValue(Pair<String, Position> value) {
                                            throw new UnsupportedOperationException();
                                        }
                                    });
                                }
                            }
                            return es;
                        }
                    };
                }

                @Override
                public Position getElementPosition() {
                    return line;
                }

                @Override
                public Position getContentPosition() {
                    return line;
                }
            });
            handler.handleStartTag(tags.peek());
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            handler.handleEndTag(tags.pop());
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // The reported position is the locator's line minus the number of line breaks
            // contained in the text itself.
            handler.handleText(new LineNumberPosition(url, localFileName, locator.getLineNumber() - countLines(ch, start, length)), new String(ch, start, length));
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            handler.handleText(new LineNumberPosition(url, localFileName, locator.getLineNumber()), new String(ch, start, length));
        }

        @Override
        public void startDocument() throws SAXException {
            // do nothing
        }

        @Override
        public void endDocument() throws SAXException {
            // do nothing
        }

        @Override
        public void startPrefixMapping(String prefix, String uri) throws SAXException {
            // do nothing
        }

        @Override
        public void endPrefixMapping(String prefix) throws SAXException {
            // do nothing
        }

        @Override
        public void processingInstruction(String target, String data) throws SAXException {
            // do nothing
        }

        @Override
        public void skippedEntity(String name) throws SAXException {
            // do nothing
        }
    });
    // Hand the parser a character stream. Wrapping the Reader in an InputStream would
    // return char values (0..65535) where bytes (0..255) are required and corrupt any
    // non-ASCII input.
    Reader withoutCarriageReturns = new Reader() {
        @Override
        public int read(char[] cbuf, int off, int len) throws IOException {
            if (len == 0) {
                return 0;
            }
            while (true) {
                int n = reader.read(cbuf, off, len);
                if (n < 0) {
                    return -1;
                }
                // Compact the chunk in place, dropping every '\r'.
                int kept = 0;
                for (int i = 0; i < n; i++) {
                    char c = cbuf[off + i];
                    if (c != '\r') {
                        cbuf[off + kept++] = c;
                    }
                }
                if (kept > 0) {
                    return kept;
                }
                // The chunk held only carriage returns; read again rather than return 0.
            }
        }

        @Override
        public void close() {
            // Intentionally a no-op: the caller keeps ownership of the underlying reader.
        }
    };
    try {
        parser.parse(new InputSource(withoutCarriageReturns));
    } catch (IOException e) {
        assert false : e.toString();
    } catch (SAXException e) {
        assert false : e.toString();
    }
}
Example 6
| Project: validator-master File: VerifierServletTransaction.java View source code |
/**
 * Replaces the {@code htmlParser} field with a freshly created {@link HtmlParser} and applies
 * this class's fixed policy settings to it.
 */
protected void newHtmlParser() {
    final HtmlParser parser = new HtmlParser();
    // Publish the new instance first, then configure it through the local alias.
    htmlParser = parser;

    // XML-violation policies.
    parser.setCommentPolicy(XmlViolationPolicy.ALLOW);
    parser.setContentNonXmlCharPolicy(XmlViolationPolicy.ALLOW);
    parser.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
    parser.setNamePolicy(XmlViolationPolicy.ALLOW);
    parser.setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL);
    parser.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);

    // Remaining parser options.
    parser.setMappingLangToXmlLang(true);
    parser.setHtml4ModeCompatibleWithXhtml1Schemata(true);
    parser.setHeuristics(Heuristics.ALL);
}
Example 7
| Project: DataCleaner-master File: MainTest.java View source code |
/**
 * Runs the command-line entry point with HTML output to a file, then checks that the file
 * exists, that its second line is {@code <html>}, and that it parses with a strict
 * ({@code XmlViolationPolicy.FATAL}) {@link HtmlParser}.
 *
 * <p>Warnings and errors reported by the parser fail the test unless they are one of the two
 * known "character encoding not declared" messages; the number of parsed elements must lie
 * strictly between 3000 and 5000.
 */
public void testWriteHtmlToFile() throws Throwable {
    final String filename = "target/test_write_html_to_file.html";
    final String commandLine = "-conf src/test/resources/cli-examples/conf.xml -job src/test/resources/cli-examples/employees_job.xml -of " + filename + " -ot HTML";
    Main.main(commandLine.split(" "));

    final File file = new File(filename);
    assertTrue(file.exists());

    final String[] lines = FileHelper.readFileAsString(file).split("\n");
    assertEquals("<html>", lines[1]);

    try (InputStream in = FileHelper.getInputStream(file)) {
        // parse it with validator.nu for HTML correctness
        final HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.FATAL);

        // Count every element the parser reports.
        final AtomicInteger elementCounter = new AtomicInteger();
        htmlParser.setContentHandler(new DefaultHandler() {
            @Override
            public void startElement(final String uri, final String localName, final String qName, final Attributes attributes) throws SAXException {
                elementCounter.incrementAndGet();
            }
        });

        // Collect warnings and errors; a fatal error aborts the parse immediately.
        final List<Exception> warningsAndErrors = new ArrayList<>();
        htmlParser.setErrorHandler(new ErrorHandler() {
            @Override
            public void warning(final SAXParseException exception) throws SAXException {
                System.err.println("Warning: " + exception.getMessage());
                warningsAndErrors.add(exception);
            }

            @Override
            public void error(final SAXParseException exception) throws SAXException {
                System.err.println("Error: " + exception.getMessage());
                warningsAndErrors.add(exception);
            }

            @Override
            public void fatalError(final SAXParseException exception) throws SAXException {
                System.out.println("Fatal error: " + exception.getMessage());
                throw exception;
            }
        });

        htmlParser.parse(new InputSource(in));

        // the output has approx 3600 XML elements
        final int elementCount = elementCounter.get();
        final String countMessage = "Element count: " + elementCount;
        assertTrue(countMessage, elementCount > 3000);
        assertTrue(countMessage, elementCount < 5000);

        for (final Exception problem : warningsAndErrors) {
            final String message = problem.getMessage();
            // Missing-encoding notices are accepted; anything else fails the test.
            final boolean encodingNotice = message.startsWith("No explicit character encoding declaration has been seen yet")
                    || message.startsWith("The character encoding of the document was not declared.");
            if (!encodingNotice) {
                problem.printStackTrace();
                fail("Got " + warningsAndErrors.size() + " warnings and errors, see log for details");
            }
        }
    }
}
Example 8
| Project: lux-master File: XQueryComponent.java View source code |
/**
 * Converts every content stream of the request into an {@link XdmItem}, appends it to
 * {@code result}, and emits one empty {@code http:body} element per stream on {@code builder}.
 *
 * <p>Each part is tried as XML first when its content type is XML or is not text; if that fails
 * (or the type is plain text or HTML) the bytes are decoded with the declared charset
 * (default {@code utf-8}) and either parsed as HTML or wrapped in a text node. Parts are
 * labelled {@code #part0}, {@code #part1}, ... in iteration order.
 *
 * @param builder   receives the {@code http:body} elements and supplies the configuration for
 *                  text nodes
 * @param req       request whose content streams are processed
 * @param result    list the converted parts are appended to
 * @param evaluator used to build XML and HTML documents
 * @throws XPathException declared for the builder calls
 * @throws LuxException if a stream cannot be read or its charset is unsupported
 */
private void handleContentStreams(LinkedTreeBuilder builder, SolrQueryRequest req, ArrayList<XdmItem> result, Evaluator evaluator) throws XPathException {
    // parts
    int i = 0;
    for (ContentStream stream : req.getContentStreams()) {
        String contentType = stream.getContentType();
        byte[] partBytes = null;
        try {
            // NOTE(review): the stream returned by getStream() is never closed here --
            // confirm whether the caller or the framework is responsible for closing it.
            partBytes = IOUtils.toByteArray(stream.getStream(), stream.getSize());
        } catch (IOException e) {
            throw new LuxException(e);
        }
        String charset = ContentStreamBase.getCharsetFromContentType(contentType);
        if (charset == null) {
            charset = "utf-8";
        }
        if (!isText(contentType)) {
            logger.warn("Binary values not supported; treating " + contentType + " as xml, or text");
        }
        XdmItem part = null;
        if (isXML(contentType) || !isText(contentType)) {
            try {
                part = evaluator.build(new ByteArrayInputStream(partBytes), "#part" + i);
            } catch (LuxException e) {
                logger.warn("Caught an exception while parsing XML: " + e.getMessage() + ", treating it as plain text");
                contentType = "text/plain; charset=" + charset;
            }
        }
        if (part == null) {
            String text;
            try {
                text = new String(partBytes, charset);
            } catch (UnsupportedEncodingException e1) {
                throw new LuxException(e1);
            }
            if (isHTML(contentType)) {
                HtmlParser parser = new HtmlParser();
                SAXSource source = new SAXSource(parser, new InputSource(new StringReader(text)));
                try {
                    part = evaluator.getDocBuilder().build(source);
                } catch (SaxonApiException e) {
                    // Log the stack trace through the logger instead of printStackTrace().
                    logger.warn("failed to parse HTML; treating as plain text: " + e.getMessage(), e);
                }
            }
            if (part == null) {
                TextFragmentValue node = new TextFragmentValue(text, "#part" + i);
                node.setConfiguration(builder.getConfiguration());
                part = new XdmNode(node);
            }
        }
        result.add(part);
        builder.startElement(fQNameFor("http", EXPATH_HTTP_NS, "body"), BuiltInAtomicType.UNTYPED_ATOMIC, 0, 0);
        // NOTE(review): position is always "1" regardless of the part index -- confirm intent.
        addAttribute(builder, "position", "1");
        addAttribute(builder, "content-type", contentType);
        builder.startContent();
        builder.endElement();
        // Advance the part index; without this every part would be labelled "#part0".
        i++;
    }
}
Example 9
| Project: fcrepo4-master File: FedoraLdpIT.java View source code |
/**
 * Fetches the resource at {@code path} as {@code text/html}, asserts an OK status, and runs
 * the response body through an {@link HtmlParser} configured with {@code NO_DOCTYPE_ERRORS},
 * an {@code HTMLErrorHandler} and a {@code TreeBuilder}.
 *
 * @param path path of the resource to request
 * @throws IOException if the request or reading the response fails
 * @throws SAXException if the parser (or its error handler) rejects the HTML
 */
private static void validateHTML(final String path) throws IOException, SAXException {
    final HttpGet request = getObjMethod(path);
    request.addHeader(ACCEPT, "text/html");

    try (final CloseableHttpResponse response = execute(request)) {
        assertEquals(OK.getStatusCode(), getStatus(response));

        final String html = EntityUtils.toString(response.getEntity());
        logger.trace("Retrieved HTML view:\n" + html);

        final HtmlParser parser = new HtmlParser(ALLOW);
        parser.setDoctypeExpectation(NO_DOCTYPE_ERRORS);
        parser.setErrorHandler(new HTMLErrorHandler());
        parser.setContentHandler(new TreeBuilder());

        final byte[] htmlBytes = html.getBytes(UTF_8);
        try (final InputStream htmlStream = new ByteArrayInputStream(htmlBytes)) {
            final InputSource source = new InputSource(htmlStream);
            parser.parse(source);
        }

        logger.debug("HTML found to be valid.");
    }
}
Example 10
| Project: AnalyzerBeans-master File: MainTest.java View source code |
/**
 * Runs the command-line entry point with HTML output to a file, then checks that the file
 * exists, that its second line is {@code <html>}, and that it parses with a strict
 * ({@code XmlViolationPolicy.FATAL}) {@link HtmlParser}.
 *
 * <p>Warnings and errors reported by the parser fail the test unless they are one of the two
 * known "character encoding not declared" messages; the number of parsed elements must lie
 * strictly between 3000 and 5000.
 */
public void testWriteHtmlToFile() throws Throwable {
    String filename = "target/test_write_html_to_file.html";
    String commandLine = "-conf examples/conf.xml -job examples/employees_job.xml -of " + filename + " -ot HTML";
    Main.main(commandLine.split(" "));

    File file = new File(filename);
    assertTrue(file.exists());

    String[] lines = FileHelper.readFileAsString(file).split("\n");
    assertEquals("<html>", lines[1]);

    InputStream in = FileHelper.getInputStream(file);
    try {
        // parse it with validator.nu for HTML correctness
        final HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.FATAL);

        // Count every element the parser reports.
        final AtomicInteger elementCounter = new AtomicInteger();
        htmlParser.setContentHandler(new DefaultHandler() {
            @Override
            public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
                elementCounter.incrementAndGet();
            }
        });

        // Collect warnings and errors; a fatal error aborts the parse immediately.
        final List<Exception> warningsAndErrors = new ArrayList<Exception>();
        htmlParser.setErrorHandler(new ErrorHandler() {
            @Override
            public void warning(SAXParseException exception) throws SAXException {
                System.err.println("Warning: " + exception.getMessage());
                warningsAndErrors.add(exception);
            }

            @Override
            public void error(SAXParseException exception) throws SAXException {
                System.err.println("Error: " + exception.getMessage());
                warningsAndErrors.add(exception);
            }

            @Override
            public void fatalError(SAXParseException exception) throws SAXException {
                System.out.println("Fatal error: " + exception.getMessage());
                throw exception;
            }
        });

        htmlParser.parse(new InputSource(in));

        // the output has approx 3600 XML elements
        int elementCount = elementCounter.get();
        String countMessage = "Element count: " + elementCount;
        assertTrue(countMessage, elementCount > 3000);
        assertTrue(countMessage, elementCount < 5000);

        for (Exception problem : warningsAndErrors) {
            String message = problem.getMessage();
            // Missing-encoding notices are accepted; anything else fails the test.
            boolean encodingNotice = message.startsWith("No explicit character encoding declaration has been seen yet")
                    || message.startsWith("The character encoding of the document was not declared.");
            if (!encodingNotice) {
                problem.printStackTrace();
                fail("Got " + warningsAndErrors.size() + " warnings and errors, see log for details");
            }
        }
    } finally {
        in.close();
    }
}