/*
* Autopsy Forensic Browser
*
* Copyright 2011-2014 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;
/**
 * Takes an AbstractFile, extracts strings from it, converts them into chunks
 * (associated with the original source file) of up to 1MB each, and then
 * indexes each chunk as text with Solr.
 */
class StringsTextExtractor implements TextExtractor {
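/*
 * A minimal usage sketch (not taken from the original sources): the
 * ExtractOptions keys are the same ones index() reads below; 'file' and
 * 'context' are assumed to be supplied by the ingest pipeline.
 *
 *   StringsTextExtractor extractor = new StringsTextExtractor();
 *   Map<String, String> opts = new HashMap<>();
 *   opts.put(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString(), Boolean.TRUE.toString());
 *   extractor.setOptions(opts);
 *   boolean indexed = extractor.index(file, context); //AbstractFile, IngestJobContext
 */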
private final Ingester ingester;
private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
private static final int BOM_LEN = 0; //prepending a UTF-8 BOM to each chunk is disabled; would be 3 if enabled
private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
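//extraction script assumed when the caller never configures any via setScripts()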
private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
private AbstractFile sourceFile;
private int numChunks = 0;
private final List<SCRIPT> extractScripts = new ArrayList<>();
private Map<String, String> extractOptions = new HashMap<>();
public StringsTextExtractor() {
ingester = Server.getIngester();
extractScripts.add(DEFAULT_SCRIPT);
}
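/**
 * Replaces any previously configured extraction scripts with the given list.
 * Always returns true, because the strings extractor accepts any script list.
 */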
@Override
public boolean setScripts(List<SCRIPT> extractScripts) {
this.extractScripts.clear();
this.extractScripts.addAll(extractScripts);
return true;
}
@Override
public List<SCRIPT> getScripts() {
return new ArrayList<>(extractScripts);
}
@Override
public int getNumChunks() {
return this.numChunks;
}
@Override
public AbstractFile getSourceFile() {
return sourceFile;
}
@Override
public Map<String, String> getOptions() {
return extractOptions;
}
@Override
public void setOptions(Map<String, String> options) {
this.extractOptions = options;
}
@Override
public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
this.sourceFile = sourceFile;
this.numChunks = 0; //unknown until indexing is done
boolean success = false;
final boolean extractUTF8
= Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
final boolean extractUTF16
= Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));
if (!extractUTF8 && !extractUTF16) {
//no string encodings are enabled, so there is nothing to extract
return true;
}
InputStream stringStream;
//check which extract stream to use
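//both stream implementations produce bytes already encoded in INDEX_CHARSET,
//so the chunks can be handed to the ingester without further transcoding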
if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
//optimized path when only English (LATIN_1) text is wanted
stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
} else {
stringStream = new AbstractFileStringIntStream(
sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
}
try {
success = true;
//break input stream into chunks
final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
long readSize;
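//read() fills the buffer after the BOM prefix (currently zero bytes) and
//returns the number of bytes read, or -1 once the stream is exhausted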
while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
if (context.fileIngestIsCancelled()) {
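//job was cancelled: index the parent file record for the chunks sent so far and stop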
ingester.ingest(this);
return true;
}
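//chunk numbers are 1-based; numChunks is only advanced after a successful index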
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
try {
chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
++this.numChunks;
} catch (IngesterException ingEx) {
success = false;
logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
throw ingEx; //rethrow to signal the error to the caller, which moves on to the next file
}
}
//after all chunks are indexed, ingest the parent file itself without content to store numChunks
ingester.ingest(this);
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex); //NON-NLS
success = false;
} finally {
try {
stringStream.close();
} catch (IOException ex) {
logger.log(Level.WARNING, "Error closing input stream stream, file: " + sourceFile.getName(), ex); //NON-NLS
}
}
return success;
}
@Override
public boolean isContentTypeSpecific() {
return true;
}
@Override
public boolean isSupported(AbstractFile file, String detectedFormat) {
// string extraction can be run on any file type.
return true;
}
}