package com.fourlastor.dante.html; import com.fourlastor.dante.parser.ParseListener; import com.fourlastor.dante.parser.Parser; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import java.io.IOException; import java.io.StringReader; import java.util.Collections; import java.util.HashMap; import java.util.Map; class HtmlParser implements Parser, ContentHandler { private ParseListener listener; private StringBuilder buffer; @Override public void parse(String string) { org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser(); parser.setContentHandler(this); try { parser.parse(new InputSource(new StringReader(string))); } catch (IOException | SAXException e) { throw new HtmlParsingException(e); } emptyBuffer(); } private void emptyBuffer() { if (buffer != null) { String normalizedBuffer = buffer.toString().replaceAll("\\s+", " "); buffer = null; listener.characters(normalizedBuffer); } } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { emptyBuffer(); int attributesLength = attributes.getLength(); Map<String, String> attributesMap = new HashMap<>(attributesLength); for (int i = 0; i < attributesLength; i++) { attributesMap.put(attributes.getLocalName(i), attributes.getValue(i)); } listener.start(new HtmlBlock(localName, attributesMap)); } @Override public void endElement(String uri, String localName, String qName) throws SAXException { emptyBuffer(); listener.end(new HtmlBlock(localName, Collections.<String, String>emptyMap())); } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (buffer == null) { buffer = new StringBuilder(); } buffer.append(ch, start, length); } @Override public void register(ParseListener listener) { this.listener = listener; } @Override public void setDocumentLocator(Locator locator) {} @Override public void startDocument() throws SAXException {} @Override public void endDocument() throws SAXException {} @Override public void startPrefixMapping(String prefix, String uri) throws SAXException {} @Override public void endPrefixMapping(String prefix) throws SAXException {} @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {} @Override public void processingInstruction(String target, String data) throws SAXException {} @Override public void skippedEntity(String name) throws SAXException {} public static class HtmlParsingException extends RuntimeException { HtmlParsingException(Exception e) { super("HTML parsing failed", e); } } }