package org.icij.extract.parser; import org.apache.tika.sax.ExpandedTitleContentHandler; import org.icij.extract.document.Document; import org.icij.extract.document.EmbeddedDocument; import org.icij.extract.encoder.DataURIEncodingInputStream; import java.io.IOException; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParseContext; import org.apache.tika.metadata.Metadata; import org.apache.tika.io.TikaInputStream; import org.apache.tika.sax.ContentHandlerDecorator; import org.icij.extract.sax.HTML5Serializer; import org.xml.sax.Attributes; import org.xml.sax.helpers.AttributesImpl; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.icij.kaxxa.io.TokenReplacingReader; import static org.apache.tika.sax.XHTMLContentHandler.XHTML; /** * Example: * * <code> * final String uuid = UUID.randomUUID().toString(); * final String open = uuid + "/"; * final String close = "/" + uuid; * context.set(Parser.class, EmptyParser.INSTANCE); * context.set(EmbeddedDocumentExtractor.class, new EmbedLinker(document, tmp, open, close)); * reader = new EmbeddingHTMLParsingReader(document, open, close, parser, input, metadata, context); * </code> * * @since 1.0.0-beta */ public class EmbeddingHTMLParsingReader extends ParsingReader { private final TokenReplacingReader replacer; public EmbeddingHTMLParsingReader(final Document parent, final String open, final String close, final Parser parser, final TikaInputStream input, final Metadata metadata, final ParseContext context) throws IOException { super(parser, input, metadata, context, (writer)-> new SubstitutingContentHandler(parent, open, close, new ExpandedTitleContentHandler(new HTML5Serializer(writer)))); this.replacer = new TokenReplacingReader((token)-> { final EmbeddedDocument embed = parent.getEmbed(token); if (null == embed) { return null; } return DataURIEncodingInputStream.createReader(embed.getPath(), embed.getMetadata()); }, reader, open, close); } @Override public int read(char[] buffer, int offset, int length) throws IOException { return replacer.read(buffer, offset, length); } @Override public void close() throws IOException { replacer.close(); // Closes the underlying reader. super.close(); } private static class SubstitutingContentHandler extends ContentHandlerDecorator { private boolean isEmbeddedImgTagOpen = false; private boolean isEmbeddedAnchorTagOpen = false; private boolean anchorTagDropped = false; private AttributesImpl imgAttributes = null; private static final String IMG_TAG = "img"; private static final String ANCHOR_TAG = "a"; private final Document parent; private final String open; private final String close; SubstitutingContentHandler(final Document parent, final String open, final String close, final ContentHandler handler) { super(handler); this.open = open; this.close = close; this.parent = parent; } @Override public void startElement(final String uri, final String localName, final String qName, final Attributes atts) throws SAXException { final AttributesImpl attributes = new AttributesImpl(atts); if (IMG_TAG.equalsIgnoreCase(localName) && XHTML.equals(uri) && !isEmbeddedImgTagOpen && !isEmbeddedAnchorTagOpen) { final String src = attributes.getValue("", "src"); if (null != src && src.startsWith("embedded:")) { isEmbeddedImgTagOpen = true; imgAttributes = attributes; } else if (null != src && src.startsWith("cid:")) { super.startElement(uri, localName, qName, atts); } } else if (ANCHOR_TAG.equalsIgnoreCase(localName) && XHTML.equals(uri) && !isEmbeddedAnchorTagOpen) { final String href = attributes.getValue("", "href"); String path = null; if (null != href && href.startsWith(open) && href.endsWith(close)) { path = href.substring(open.length(), href.length() - close.length()); } if (null != path && null != parent.getEmbed(path)) { isEmbeddedAnchorTagOpen = true; // Drop the anchor tag if coming after an embedded image. if (isEmbeddedImgTagOpen) { imgAttributes.setAttribute(imgAttributes.getIndex("", "src"), "", "src", "src", "CDATA", href); super.startElement(uri, IMG_TAG, IMG_TAG, imgAttributes); super.endElement(uri, IMG_TAG, IMG_TAG); isEmbeddedImgTagOpen = false; imgAttributes = null; anchorTagDropped = true; } else { super.startElement(uri, localName, qName, attributes); anchorTagDropped = false; } } else if (isEmbeddedImgTagOpen) { isEmbeddedImgTagOpen = false; imgAttributes = null; super.startElement(uri, localName, qName, attributes); } } else { if (isEmbeddedAnchorTagOpen) { isEmbeddedAnchorTagOpen = false; anchorTagDropped = false; } if (isEmbeddedImgTagOpen) { isEmbeddedImgTagOpen = false; imgAttributes = null; } super.startElement(uri, localName, qName, attributes); } } @Override public void characters(char[] ch, int start, int length) throws SAXException { // Swallow the text in between UUID start and close tags. if (!isEmbeddedAnchorTagOpen || !anchorTagDropped) { super.characters(ch, start, length); } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { // Swallow the event if closing an embedded image tag. if (isEmbeddedImgTagOpen && IMG_TAG.equalsIgnoreCase(localName) && XHTML.equals(uri)) { return; } // Error state. Output this event and the previous one. if (isEmbeddedImgTagOpen) { super.startElement(uri, IMG_TAG, IMG_TAG, imgAttributes); isEmbeddedImgTagOpen = false; imgAttributes = null; } if (isEmbeddedAnchorTagOpen && ANCHOR_TAG.equalsIgnoreCase(localName) && XHTML.equals(uri)) { isEmbeddedAnchorTagOpen = false; } else { super.endElement(uri, localName, qName); } } } }