package org.icij.extract.extractor; import org.apache.poi.poifs.filesystem.*; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.icij.extract.document.Document; import org.icij.extract.document.EmbeddedDocument; import org.icij.extract.document.PathIdentifier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; import static org.apache.tika.sax.XHTMLContentHandler.XHTML; /** * A custom extractor that saves all embeds to temporary files and records the new paths. It does this * non-recursively, so only embedded documents one level deep are linked. * * Ideally {@link #parseEmbedded} would use {@link org.apache.tika.metadata.TikaCoreProperties.EmbeddedResourceType} * but only the PDF parser seems to support it as of Tika 1.8. * * @since 1.0.0-beta */ public class EmbedLinker implements EmbeddedDocumentExtractor { /** * Logger for logging exceptions. */ private static final Logger logger = LoggerFactory.getLogger(EmbedLinker.class); private final Document parent; private final TemporaryResources tmp; private final String open; private final String close; private int untitled = 0; EmbedLinker(final Document parent, final TemporaryResources tmp, final String open, final String close) { this.parent = parent; this.tmp = tmp; this.open = open; this.close = close; } /** * Always returns true. Files are not actually parsed. They are exported. * * @param metadata metadata */ @Override public boolean shouldParseEmbedded(final Metadata metadata) { return true; } @Override public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata, final boolean outputHtml) throws SAXException, IOException { String name = metadata.get(Metadata.RESOURCE_NAME_KEY); if (null == name || name.isEmpty()) { name = String.format("untitled file %d", ++untitled); } final EmbeddedDocument embed = saveEmbedded(name, input, metadata); // If outputHtml is false then it means that the parser already outputted markup for the embed. if (outputHtml) { final AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "package-entry"); handler.startElement(XHTML, "div", "div", attributes); } final AttributesImpl attributes = new AttributesImpl(); final String type = metadata.get(Metadata.CONTENT_TYPE); final String path = embed.getPath().toString(); attributes.addAttribute("", "href", "href", "CDATA", open + path + close); attributes.addAttribute("", "title", "title", "CDATA", name); attributes.addAttribute("", "download", "download", "CDATA", name); if (null != type) { attributes.addAttribute("", "type", "type", "CDATA", type); } final char[] chars = name.toCharArray(); handler.startElement(XHTML, "a", "a", attributes); handler.characters(chars, 0, chars.length); handler.endElement(XHTML, "a", "a"); if (outputHtml) { handler.endElement(XHTML, "div", "div"); } } private EmbeddedDocument saveEmbedded(final String name, final InputStream input, final Metadata metadata) throws IOException { final Path path = tmp.createTemporaryFile().toPath(); // Add the embedded document to the parent with a key (which is the temporary path) so that it can be looked // up later. final EmbeddedDocument embed = parent.addEmbed(path.toString(), new PathIdentifier(), path, metadata); if ((input instanceof TikaInputStream) && ((TikaInputStream) input).getOpenContainer() != null && ( (TikaInputStream) input).getOpenContainer() instanceof DirectoryEntry) { final POIFSFileSystem fs = new POIFSFileSystem(); saveEntries((DirectoryEntry) ((TikaInputStream) input).getOpenContainer(), fs.getRoot()); try (final OutputStream output = Files.newOutputStream(path)) { fs.writeFilesystem(output); } return embed; } final long copied; try { copied = Files.copy(input, path, StandardCopyOption.REPLACE_EXISTING); } finally { input.close(); } if (copied > 0) { logger.info("Copied {} bytes from embedded document \"{}\" in \"{}\" to file.", copied, name, parent); } else { logger.warn("No bytes copied for embedded document \"{}\" in \"{}\". " + "This could indicate a downstream error.", name, parent); } return embed; } private void saveEntries(final DirectoryEntry source, final DirectoryEntry destination) throws IOException { for (Entry entry : source) { // Recursively save sub-entries. if (entry instanceof DirectoryEntry) { saveEntries((DirectoryEntry) entry, destination.createDirectory(entry.getName())); continue; } // Copy the entry. try (final InputStream contents = new DocumentInputStream((DocumentEntry) entry)) { destination.createDocument(entry.getName(), contents); } catch (IOException e) { logger.error("Unable to save embedded document \"{}\" in document: \"{}\".", entry.getName(), parent, e); } } } }