package org.icij.extract.extractor;
import java.io.*;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.DelegatingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.icij.extract.document.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.SAXException;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
/**
* A custom extractor that is an almost exact copy of Tika's default extractor for embedded documents.
*
* Logs errors that Tika's default extractor otherwise swallows, but doesn't throw them, allowing parsing to continue.
*
* @since 1.0.0-beta
*/
public class EmbedParser extends ParsingEmbeddedDocumentExtractor {
private static final Logger logger = LoggerFactory.getLogger(EmbedParser.class);
private static final Parser DELEGATING_PARSER = new DelegatingParser();
final Document root;
protected final ParseContext context;
EmbedParser(final Document root, final ParseContext context) {
super(context);
this.root = root;
this.context = context;
}
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
final boolean outputHtml) throws SAXException, IOException {
if (outputHtml) {
writeStart(handler, metadata);
}
delegateParsing(input, new EmbeddedContentHandler(new BodyContentHandler(handler)), metadata);
if (outputHtml) {
writeEnd(handler);
}
}
void delegateParsing(final InputStream input, final ContentHandler handler, final Metadata metadata)
throws IOException, SAXException {
try (final TemporaryResources tmp = new TemporaryResources()) {
final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(input), tmp);
if (input instanceof TikaInputStream) {
final Object container = ((TikaInputStream) input).getOpenContainer();
if (container != null) {
newStream.setOpenContainer(container);
}
}
// Use the delegate parser to parse this entry.
DELEGATING_PARSER.parse(newStream, handler, metadata, context);
} catch (EncryptedDocumentException e) {
logger.error("Encrypted document embedded in document: \"{}\" (in \"{}\").",
metadata.get(Metadata.RESOURCE_NAME_KEY), root, e);
} catch (TikaException e) {
logger.error("Unable to parse embedded document: \"{}\" (in \"{}\").",
metadata.get(Metadata.RESOURCE_NAME_KEY), root, e);
}
}
void writeStart(final ContentHandler handler, final Metadata metadata) throws SAXException {
final AttributesImpl attributes = new AttributesImpl();
final String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
attributes.addAttribute("", "class", "class", "CDATA", "package-entry");
handler.startElement(XHTML, "div", "div", attributes);
if (name != null && name.length() > 0) {
handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
char[] chars = name.toCharArray();
handler.characters(chars, 0, chars.length);
handler.endElement(XHTML, "h1", "h1");
}
}
void writeEnd(final ContentHandler handler) throws SAXException {
handler.endElement(XHTML, "div", "div");
}
}