/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extracts text from AbstractFile content of Tika-supported MIME types. The
 * extracted text is divided into chunks and indexed with Solr. A timeout
 * mechanism protects against Tika parser hangs on unexpected or corrupt
 * content. This extraction/chunking utility is suitable for large files of
 * any Tika parser-supported content type.
 *
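 * <p>
 * A minimal usage sketch (hypothetical names: {@code file},
 * {@code detectedMimeType}, and {@code context} are assumed to be supplied by
 * the calling ingest module):
 * <pre>{@code
 * TikaTextExtractor extractor = new TikaTextExtractor();
 * if (extractor.isSupported(file, detectedMimeType)) {
 *     try {
 *         boolean indexed = extractor.index(file, context);
 *     } catch (Ingester.IngesterException ex) {
 *         // extraction or indexing failed; caller may fall back to string extraction
 *     }
 * }
 * }</pre>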
 */
class TikaTextExtractor implements TextExtractor {

    private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private static Ingester ingester;
    private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128; //for whitespace
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private AbstractFile sourceFile; //currently processed file
    private int numChunks = 0;
    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
    private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();

    TikaTextExtractor() {
        ingester = Server.getIngester();
        Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
        for (MediaType mt : mediaTypes) {
            TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
        }
        //logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); //NON-NLS
    }

    @Override
    public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
        return false;
    }

    @Override
    public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
        return null;
    }

    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }

    @Override
    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;
        final InputStream stream = new ReadContentInputStream(sourceFile);
        try {
            Metadata meta = new Metadata();

            //parse the file in a background task, so a hung parser can be timed out
            Tika tika = new Tika(); //new tika instance for every file, to work around tika memory issues
            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
            final Future<?> future = tikaParseExecutor.submit(parseTask);
            try {
                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
            } catch (TimeoutException te) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            } catch (Exception ex) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            }

            //get the reader with the results
            reader = parseTask.getReader();
            if (reader == null) {
                //likely due to exception in parse()
                logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
                return false;
            }

            //break the results into chunks and index
            success = true;
            long readSize;
            long totalRead = 0;
            boolean eof = false;
            //we read max 1024 chars at a time; this seems to be the max this Reader returns per call
            while (!eof) {
                if (context.fileIngestIsCancelled()) {
                    ingester.ingest(this);
                    return true;
                }
                readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
                if (readSize == -1) {
                    eof = true;
                } else {
                    totalRead += readSize;
                }
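                // Chunk assembly: the loops below keep appending fixed-size
                // reads into textChunkBuf until the chunk is nearly full,
                // reserving EXTRA_CHARS of headroom so the final char-by-char
                // reads can finish the current word at a whitespace boundary
                // instead of splitting it across two indexed chunks.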
                //consume more chars to fill the entire chunk (leave EXTRA_CHARS to end the word)
                while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //read char-by-char until whitespace so words are not broken across chunks
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                //sanitize the chunk by replacing invalid UTF-8 characters with a caret '^'
                for (int i = 0; i < totalRead; ++i) {
                    if (!TextUtil.isValidSolrUTF8(textChunkBuf[i])) {
                        textChunkBuf[i] = '^';
                    }
                }

                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                sb.append(textChunkBuf, 0, (int) totalRead);

                //reset for next chunk
                totalRead = 0;

                //append metadata if this is the last chunk
                if (eof) {
                    //sort metadata keys
                    List<String> sortedKeyList = Arrays.asList(meta.names());
                    Collections.sort(sortedKeyList);
                    sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
                    for (String key : sortedKeyList) {
                        String value = meta.get(key);
                        sb.append(key).append(": ").append(value).append("\n");
                    }
                }

                //encode the chunk to bytes in the output charset (UTF-8) for indexing
                byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
                    ++this.numChunks;
                } catch (Ingester.IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
                    throw ingEx; //need to rethrow to signal the error and move on
                }
            }
        } catch (IOException ex) {
            final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } catch (Exception ex) {
            final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
            }
        }

        //after all chunks, ingest the parent file without content itself, and store numChunks
        ingester.ingest(this);
        return success;
    }

    @Override
    public boolean isContentTypeSpecific() {
        return true;
    }
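    /**
     * Checks whether this extractor should handle the given file, based on the
     * detected MIME type. Unstructured binary blobs, archives, non-FLV video,
     * and TTF fonts are excluded (see inline comments); any remaining type is
     * accepted if Tika's parsers report support for it.
     */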
    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null) {
            return false;
        } else if (detectedFormat.equals("application/octet-stream") //NON-NLS
                || detectedFormat.equals("application/x-msdownload")) { //NON-NLS
            //any binary unstructured blobs (string extraction will be used)
            return false;
        } else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
            return false;
        } else if (detectedFormat.contains("video/") //NON-NLS
                && !detectedFormat.equals("video/x-flv")) { //NON-NLS
            //skip video other than flv (tika supports flv only)
            return false;
        } else if (detectedFormat.contains("application/x-font-ttf")) { //NON-NLS
            // Tika currently has a bug in the ttf parser in fontbox;
            // it will throw an out of memory exception
            return false;
        }
        //TODO might need to add more mime-types to ignore

        //then accept all formats supported by Tika
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

    /**
     * Runnable task that calls tika to parse the content using the input
     * stream. Provides reader for results.
     */
    private static class ParseRequestTask implements Runnable {

        //in
        private Tika tika;
        private InputStream stream;
        private Metadata meta;
        private AbstractFile sourceFile;
        //out
        private Reader reader;

        ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
            this.tika = tika;
            this.stream = stream;
            this.meta = meta;
            this.sourceFile = sourceFile;
        }

        @Override
        public void run() {
            try {
                reader = tika.parse(stream, meta);
            } catch (Exception ex) {
                //covers IOException from the stream as well as any parser failure
                KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content " //NON-NLS
                        + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                tika = null;
                reader = null;
            }
        }

        public Reader getReader() {
            return reader;
        }
    }
}