package org.icij.extract.extractor; import org.apache.poi.poifs.filesystem.*; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.utils.ExceptionUtils; import org.icij.extract.document.Document; import org.icij.extract.document.EmbeddedDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import java.io.*; import java.nio.charset.StandardCharsets; import java.nio.file.FileAlreadyExistsException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.LinkedList; import java.util.function.Function; public class EmbedSpawner extends EmbedParser { private static final Logger logger = LoggerFactory.getLogger(EmbedParser.class); private final TemporaryResources tmp; private final Path output; private ContentHandler originalHandler = null; private final Function<Writer, ContentHandler> handlerFunction; private LinkedList<Document> stack = new LinkedList<>(); private int untitled = 0; EmbedSpawner(final Document root, final TemporaryResources tmp, final ParseContext context, final Path output, final Function<Writer, ContentHandler> handlerFunction) { super(root, context); this.tmp = tmp; this.output = output; this.handlerFunction = handlerFunction; stack.add(root); } @Override public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata, final boolean outputHtml) throws SAXException, IOException { // Need to keep track of and use the original handler, since a modified one is passed to the parser. if (null == originalHandler) { originalHandler = handler; } // Use a different handler for receiving SAX events from the embedded document. The allows the main content // handler that receives the entire concatenated content to receive only the body of the embed, while the // handler that writes to the temporary file will receive the entire document. final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(originalHandler)); if (outputHtml) { writeStart(originalHandler, metadata); } // There's no need to spawn inline embeds, like images in PDFs. These should be concatenated to the main // document as usual. if (TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(metadata .get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) { delegateParsing(input, embedHandler, metadata); } else { spawnEmbedded(input, embedHandler, metadata); } if (outputHtml) { writeEnd(originalHandler); } } private void spawnEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata) throws IOException { final Path parsedOutputPath = tmp.createTempFile(); String name = metadata.get(Metadata.RESOURCE_NAME_KEY); final Writer writer = Files.newBufferedWriter(parsedOutputPath, StandardCharsets.UTF_8); final ContentHandler teeHandler = new TeeContentHandler(handler, handlerFunction.apply(writer)); final EmbeddedDocument embed = stack.getLast().addEmbed(metadata); stack.add(embed); // Use temporary resources to copy formatted output from the content handler to a temporary file. // Call setReader on the embed object with a plain reader for this temp file. // When all parsing finishes, close temporary resources. // Note that getPath should still return the path to the original file. embed.setReader(() -> Files.newBufferedReader(parsedOutputPath, StandardCharsets.UTF_8)); if (null == name || name.isEmpty()) { name = String.format("untitled_%d", ++untitled); } // Trigger spooling of the file to disk so that it can be copied. // This needs to be done before parsing starts and the same TIS object must be passed to writeEmbed, // otherwise it will be spooled twice. final TikaInputStream tis = TikaInputStream.get(input, tmp); if (null != output) { tis.getPath(); } try { // Pass the same TIS, otherwise the EmbedParser will attempt to spool the input again and fail, because it's // already been consumed. delegateParsing(tis, teeHandler, metadata); } catch (Exception e) { // Note that even on exception, the document is intentionally NOT removed from the parent. logger.error("Unable to parse embedded document: \"{}\" (in \"{}\").", name, root, e); // TODO: Change to TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM in Tika 1.15. metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getFilteredStackTrace(e)); } finally { stack.removeLast(); writer.close(); } // Write the embed file to the given output directory. if (null != output) { writeEmbed(tis, embed, name); } } private void writeEmbed(final TikaInputStream tis, final EmbeddedDocument embed, final String name) throws IOException { final Path source; final Metadata metadata = embed.getMetadata(); final Object container = tis.getOpenContainer(); // If the input is a container, write it to a temporary file so that it can then be copied atomically. // This happens with, for example, an Outlook Message that is an attachment of another Outlook Message. if (container instanceof DirectoryEntry) { final POIFSFileSystem fs = new POIFSFileSystem(); source = tmp.createTempFile(); saveEntries((DirectoryEntry) container, fs.getRoot()); try (final OutputStream output = Files.newOutputStream(source)) { fs.writeFilesystem(output); } } else { source = tis.getPath(); } // Set the content-length as it isn't (always?) set by Tika for embeds. if (null == metadata.get(Metadata.CONTENT_LENGTH)) { metadata.set(Metadata.CONTENT_LENGTH, Long.toString(Files.size(source))); } // To prevent massive duplication and because the disk is only a storage for underlying date, save using the // straight hash as a filename. try (final OutputStream copy = Files.newOutputStream(output.resolve(embed.getHash()), StandardOpenOption.CREATE_NEW)) { Files.copy(source, copy); } catch (FileAlreadyExistsException e) { logger.info("Temporary file for document \"{}\" in \"{}\" already exists.", name, root); } } private void saveEntries(final DirectoryEntry source, final DirectoryEntry destination) throws IOException { for (Entry entry : source) { // Recursively save sub-entries or copy the entry. if (entry instanceof DirectoryEntry) { saveEntries((DirectoryEntry) entry, destination.createDirectory(entry.getName())); } else { try (final InputStream contents = new DocumentInputStream((DocumentEntry) entry)) { destination.createDocument(entry.getName(), contents); } } } } }