/** * * Copyright 2014 The MITRE Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. * * ************************************************************************** * NOTICE This software was produced for the U. S. Government under Contract No. * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer * Software and Noncommercial Computer Software Documentation Clause * 252.227-7014 (JUN 1995) * * (c) 2014 The MITRE Corporation. All Rights Reserved. * ************************************************************************** */ package org.opensextant.xtext.converters; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.commons.compress.utils.IOUtils; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; import org.apache.tika.config.TikaConfig; import org.apache.tika.extractor.EmbeddedResourceHandler; import org.apache.tika.extractor.ParserContainerExtractor; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.parser.ParseContext; import org.opensextant.xtext.Content; import org.opensextant.xtext.ConvertedDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class EmbeddedContentConverter extends DefaultConverter { ParseContext context = new ParseContext(); Logger log = LoggerFactory.getLogger(getClass()); private final static Set<String> supportedTypes = new HashSet<String>(); static { supportedTypes.add("pptx"); supportedTypes.add("ppt"); supportedTypes.add("docx"); supportedTypes.add("doc"); supportedTypes.add("pdf"); // Tika 1.5 test case only - // supportedTypes.add("xls"); } public EmbeddedContentConverter() { super(); } public EmbeddedContentConverter(int sz) { super(sz); } /** * If file type is NOT supported, the ConvertedDocument from the DefaultConverter will be returned. * if the file type is supported, the ConvertedDocument from the default is used as the parent to * organize the embedded items found within. Embedded items are organized on disk with metadata. * * Supported = MS PPT/PPTX, DOC/DOCX, PDF * @param fileext ext * @return true if file extension is supported. */ public static boolean isSupported(String fileext) { if (StringUtils.isBlank(fileext)) { return false; } return supportedTypes.contains(fileext.toLowerCase()); } /** * Convert Embedded documents in the supported types to a folder of the embedded items. * Trivial embedded icons and other components will not be extracted * */ @Override protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException { ConvertedDocument compoundDoc = super.conversionImplementation(in, doc); String ext = FilenameUtils.getExtension(doc.getName()); if (!isSupported(ext)) { // We know we don't support textifying compound docs of this type. DONE! // return compoundDoc; // Not really compound by our standards here. } ParserContainerExtractor extractor = new ParserContainerExtractor(); EmbeddedObjectExtractor objExtractor = new EmbeddedObjectExtractor(compoundDoc, true); TikaInputStream tikaStream = null; try { tikaStream = TikaInputStream.get(doc.toPath()); extractor.extract(tikaStream, extractor, objExtractor); compoundDoc.is_converted = true; if (compoundDoc.hasRawChildren()) { // Create text buffer for this compound document here. // If raw children should be post-processed by some other means, that is up to caller. // This parent document at least contains a complete text representation of the content in the original doc. StringBuilder completeText = new StringBuilder(); completeText.append(compoundDoc.getText()); completeText.append("\n==Embedded Objects==\n"); completeText.append(renderText(compoundDoc.getRawChildren())); compoundDoc.setText(completeText.toString()); compoundDoc.is_converted = true; return compoundDoc; } else { // Okay, the complicated Embedded doc approach did not yied anything. // Try the simple approach. return compoundDoc; } } catch (Exception e) { throw new IOException("Stream parsing problem", e); } finally { tikaStream.close(); } } private final DefaultConverter conv = new DefaultConverter(); /** * * @param childObjects children * @return text assembled from children */ private String renderText(List<Content> childObjects) { StringBuilder buf = new StringBuilder(); for (Content c : childObjects) { buf.append(String.format("\n[Embedded: %s; %s]\n", c.id, c.tikaMediatype.toString())); try { // NOTE: To do this well, you may have to write bytes to disk as a valid file name // And let Tika convert in full. ConvertedDocument text = conv.conversionImplementation( TikaInputStream.get(c.content, c.tikaMetadata), null); buf.append(text.getText()); } catch (IOException ioe) { buf.append("Unconvertable content"); } buf.append("\n"); } return buf.toString(); } private final static Set<String> filterableMeta = new HashSet<String>(); static { filterableMeta.add("application/x-emf"); filterableMeta.add("application/x-msmetafile"); // PNG is not trivial item, as are icons or EMF stuff. // filter out particular mime-types by situation in resource handler EmbeddedObjectExtractor // filterableMeta.add("image/png"); } /** * Embedded extractor here saves embedded objects to folder structure. * * @author ubaldino * */ class EmbeddedObjectExtractor implements EmbeddedResourceHandler { ConvertedDocument parent = null; int objectCount = 0; boolean filterOut = true; EmbeddedObjectExtractor(ConvertedDocument par, boolean filterTrivia) throws IOException { parent = par; filterOut = filterTrivia; } /** * Certain items are trivial. * * @param mediaType media/MIME type * @return true if object type should be filtered */ public boolean filterOutTrivialObjects(String mediaType) { if (filterableMeta.contains(mediaType)) { return true; } if (filterOut) { if ("image/png".equalsIgnoreCase(mediaType)) { return true; } } return false; } /** * EmbeddedResourceHandler interface; listen for objects and handle them as needed. */ @Override public void handle(String filename, MediaType mediaType, InputStream stream) { Metadata md = new Metadata(); ++objectCount; String ext = "dat"; if (filterOutTrivialObjects(mediaType.toString())) { log.debug("Filtering out object " + mediaType); return; } MimeType mimeType = null; try { mimeType = TikaConfig.getDefaultConfig().getMimeRepository() .getRegisteredMimeType(mediaType.toString()); ext = mimeType.getExtension(); log.debug("Embedded object type={}", mimeType); if (StringUtils.isBlank(ext)) { ext = "dat"; } else { ext = ext.replace(".", ""); } } catch (MimeTypeException e1) { log.error("Tika could not find a file type for " + mediaType, e1); } boolean has_fname = true; if (filename == null) { filename = String.format("%s,Part%d.%s", parent.basename, objectCount, ext); has_fname = false; } else if (filename.length() < 3) { filename = String.format("%s,Part_%s_%d.%s", parent.basename, filename, objectCount, ext); } if (filename.contains("/")) { filename = filename.replace("/", "_"); } log.debug("Embbedded object file={} has filename? {}", filename, has_fname); md.add(Metadata.RESOURCE_NAME_KEY, filename); Content child = new Content(); child.id = filename; child.meta.setProperty(ConvertedDocument.CHILD_ENTRY_KEY, filename); if (mimeType != null) { child.mimeType = mimeType.toString(); } // NOTE: this is redundant here; as we just created tika Metadata() object ourselves. child.tikaMetadata = md; child.tikaMediatype = mediaType; try { child.content = IOUtils.toByteArray(stream); } catch (IOException e1) { log.error("Embedded object IO error", e1); } if (child.content.length > 0) { parent.addRawChild(child); } } } }