/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012-2013 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream;

/**
 * Extracts text from AbstractFile content of Tika-supported MIME types. The
 * extracted text is divided into chunks and indexed with Solr. A timeout
 * mechanism protects against Tika parser hangs on unexpected or corrupt
 * content. This extraction/chunking utility is suitable for large files of
 * any Tika parser-supported content type.
 *
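 * <p>
 * A minimal usage sketch (hypothetical names: {@code file},
 * {@code detectedMimeType}, and {@code context} are assumed to be supplied by
 * the calling ingest module):
 * <pre>{@code
 * TikaTextExtractor extractor = new TikaTextExtractor();
 * if (extractor.isSupported(file, detectedMimeType)) {
 *     try {
 *         boolean indexed = extractor.index(file, context);
 *     } catch (Ingester.IngesterException ex) {
 *         // extraction or indexing failed; caller may fall back to string extraction
 *     }
 * }
 * }</pre>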
 */
class TikaTextExtractor implements TextExtractor {

    private static final Logger logger = Logger.getLogger(TikaTextExtractor.class.getName());
    private static Ingester ingester;
    private static final Charset OUTPUT_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    private static final int MAX_EXTR_TEXT_CHARS = 512 * 1024;
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128; //for whitespace
    private final char[] textChunkBuf = new char[MAX_EXTR_TEXT_CHARS];
    private AbstractFile sourceFile; //currently processed file
    private int numChunks = 0;
    private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
    private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();

    TikaTextExtractor() {
        ingester = Server.getIngester();
        Set<MediaType> mediaTypes = new Tika().getParser().getSupportedTypes(new ParseContext());
        for (MediaType mt : mediaTypes) {
            TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
        }
        //logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); //NON-NLS
    }

    @Override
    public boolean setScripts(List<StringExtract.StringExtractUnicodeTable.SCRIPT> extractScripts) {
        return false;
    }

    @Override
    public List<StringExtract.StringExtractUnicodeTable.SCRIPT> getScripts() {
        return null;
    }

    @Override
    public Map<String, String> getOptions() {
        return null;
    }

    @Override
    public void setOptions(Map<String, String> options) {
    }

    @Override
    public int getNumChunks() {
        return numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }

    @Override
    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws Ingester.IngesterException {
        this.sourceFile = sourceFile;
        numChunks = 0; //unknown until indexing is done

        boolean success = false;
        Reader reader = null;
        final InputStream stream = new ReadContentInputStream(sourceFile);
        try {
            Metadata meta = new Metadata();

            //parse the file in a background task, so a hung parser can be timed out
            Tika tika = new Tika(); //new tika instance for every file, to work around tika memory issues
            ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
            final Future<?> future = tikaParseExecutor.submit(parseTask);
            try {
                future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
            } catch (TimeoutException te) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            } catch (Exception ex) {
                final String msg = NbBundle.getMessage(this.getClass(),
                        "AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
                        sourceFile.getId(), sourceFile.getName());
                KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
                logger.log(Level.WARNING, msg);
                throw new IngesterException(msg);
            }

            //get the reader with the results
            reader = parseTask.getReader();
            if (reader == null) {
                //likely due to exception in parse()
                logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
                return false;
            }

            //break the results into chunks and index
            success = true;
            long readSize;
            long totalRead = 0;
            boolean eof = false;
            //we read max 1024 chars at a time; this seems to be the max this Reader returns per call
            while (!eof) {
                if (context.fileIngestIsCancelled()) {
                    ingester.ingest(this);
                    return true;
                }
                readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
                if (readSize == -1) {
                    eof = true;
                } else {
                    totalRead += readSize;
                }
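                // Chunk assembly: the loops below keep appending fixed-size
                // reads into textChunkBuf until the chunk is nearly full,
                // reserving EXTRA_CHARS of headroom so the final char-by-char
                // reads can finish the current word at a whitespace boundary
                // instead of splitting it across two indexed chunks.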
                //consume more chars to fill the entire chunk (leave EXTRA_CHARS to end the word)
                while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                        && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
                    totalRead += readSize;
                }
                if (readSize == -1) {
                    //this is the last chunk
                    eof = true;
                } else {
                    //read char-by-char until whitespace so words are not broken across chunks
                    while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                            && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                            && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                        totalRead += readSize;
                    }
                    if (readSize == -1) {
                        //this is the last chunk
                        eof = true;
                    }
                }

                //sanitize the chunk by replacing invalid UTF-8 characters with a caret '^'
                for (int i = 0; i < totalRead; ++i) {
                    if (!TextUtil.isValidSolrUTF8(textChunkBuf[i])) {
                        textChunkBuf[i] = '^';
                    }
                }

                StringBuilder sb = new StringBuilder((int) totalRead + 1000);
                sb.append(textChunkBuf, 0, (int) totalRead);

                //reset for next chunk
                totalRead = 0;

                //append metadata if this is the last chunk
                if (eof) {
                    //sort metadata keys
                    List<String> sortedKeyList = Arrays.asList(meta.names());
                    Collections.sort(sortedKeyList);
                    sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
                    for (String key : sortedKeyList) {
                        String value = meta.get(key);
                        sb.append(key).append(": ").append(value).append("\n");
                    }
                }

                //encode the chunk to bytes in the output charset (UTF-8) for indexing
                byte[] encodedBytes = sb.toString().getBytes(OUTPUT_CHARSET);
                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
                    ++this.numChunks;
                } catch (Ingester.IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
                    throw ingEx; //need to rethrow to signal the error and move on
                }
            }
        } catch (IOException ex) {
            final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } catch (Exception ex) {
            final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
            KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
            logger.log(Level.WARNING, msg);
            success = false;
        } finally {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
            }
        }

        //after all chunks, ingest the parent file without content itself, and store numChunks
        ingester.ingest(this);
        return success;
    }

    @Override
    public boolean isContentTypeSpecific() {
        return true;
    }
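    /**
     * Checks whether this extractor should handle the given file, based on the
     * detected MIME type. Unstructured binary blobs, archives, non-FLV video,
     * and TTF fonts are excluded (see inline comments); any remaining type is
     * accepted if Tika's parsers report support for it.
     */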
    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        if (detectedFormat == null) {
            return false;
        } else if (detectedFormat.equals("application/octet-stream") //NON-NLS
                || detectedFormat.equals("application/x-msdownload")) { //NON-NLS
            //any binary unstructured blobs (string extraction will be used)
            return false;
        } else if (TextExtractor.ARCHIVE_MIME_TYPES.contains(detectedFormat)) {
            return false;
        } else if (detectedFormat.contains("video/") //NON-NLS
                && !detectedFormat.equals("video/x-flv")) { //NON-NLS
            //skip video other than flv (tika supports flv only)
            return false;
        } else if (detectedFormat.contains("application/x-font-ttf")) { //NON-NLS
            // Tika currently has a bug in the ttf parser in fontbox;
            // it will throw an out of memory exception
            return false;
        }
        //TODO might need to add more mime-types to ignore

        //then accept all formats supported by Tika
        return TIKA_SUPPORTED_TYPES.contains(detectedFormat);
    }

    /**
     * Runnable task that calls tika to parse the content using the input
     * stream. Provides reader for results.
     */
    private static class ParseRequestTask implements Runnable {

        //in
        private Tika tika;
        private InputStream stream;
        private Metadata meta;
        private AbstractFile sourceFile;
        //out
        private Reader reader;

        ParseRequestTask(Tika tika, InputStream stream, Metadata meta, AbstractFile sourceFile) {
            this.tika = tika;
            this.stream = stream;
            this.meta = meta;
            this.sourceFile = sourceFile;
        }

        @Override
        public void run() {
            try {
                reader = tika.parse(stream, meta);
            } catch (Exception ex) {
                //covers IOException from the stream as well as any parser failure
                KeywordSearch.getTikaLogger().log(Level.WARNING, "Exception: Unable to Tika parse the content " //NON-NLS
                        + sourceFile.getId() + ": " + sourceFile.getName(), ex);
                tika = null;
                reader = null;
            }
        }

        public Reader getReader() {
            return reader;
        }
    }
}