/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2014 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.AbstractFile;

/**
 * Takes an AbstractFile, extracts strings from it, converts them into chunks
 * (associated with the original source file) of up to 1MB each, and indexes
 * the chunks as text with Solr.
 */
class StringsTextExtractor implements TextExtractor {

    private static Ingester ingester;
    private static final Logger logger = Logger.getLogger(StringsTextExtractor.class.getName());
    private static final long MAX_STRING_CHUNK_SIZE = 1 * 1024 * 1024L;
    //private static final int BOM_LEN = 3;
    private static final int BOM_LEN = 0; //disabled prepending of BOM
    private static final Charset INDEX_CHARSET = Server.DEFAULT_INDEXED_TEXT_CHARSET;
    private static final SCRIPT DEFAULT_SCRIPT = SCRIPT.LATIN_2;
    private AbstractFile sourceFile;
    private int numChunks = 0;
    private final List<SCRIPT> extractScripts = new ArrayList<>();
    private Map<String, String> extractOptions = new HashMap<>();

    //disabled prepending of BOM
    //static {
    //prepend UTF-8 BOM to start of the buffer
    //stringChunkBuf[0] = (byte) 0xEF;
    //stringChunkBuf[1] = (byte) 0xBB;
    //stringChunkBuf[2] = (byte) 0xBF;
    //}
    public StringsTextExtractor() {
        ingester = Server.getIngester();
        extractScripts.add(DEFAULT_SCRIPT);
    }

    @Override
    public boolean setScripts(List<SCRIPT> extractScripts) {
        this.extractScripts.clear();
        this.extractScripts.addAll(extractScripts);
        return true;
    }

    @Override
    public List<SCRIPT> getScripts() {
        return new ArrayList<>(extractScripts);
    }

    @Override
    public int getNumChunks() {
        return this.numChunks;
    }

    @Override
    public AbstractFile getSourceFile() {
        return sourceFile;
    }

    @Override
    public Map<String, String> getOptions() {
        return extractOptions;
    }

    @Override
    public void setOptions(Map<String, String> options) {
        this.extractOptions = options;
    }

    @Override
    public boolean index(AbstractFile sourceFile, IngestJobContext context) throws IngesterException {
        this.sourceFile = sourceFile;
        this.numChunks = 0; //unknown until indexing is done

        boolean success = false;

        final boolean extractUTF8
                = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF8.toString()));
        final boolean extractUTF16
                = Boolean.parseBoolean(extractOptions.get(TextExtractor.ExtractOptions.EXTRACT_UTF16.toString()));

        if (extractUTF8 == false && extractUTF16 == false) {
            //nothing to do
            return true;
        }

        InputStream stringStream;
        //check which extract stream to use
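        // Latin-1-only extraction can use the lighter AbstractFileStringStream; any other
        // script selection needs the script-aware AbstractFileStringIntStream, which also
        // honors the UTF-8/UTF-16 extract options.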
        if (extractScripts.size() == 1 && extractScripts.get(0).equals(SCRIPT.LATIN_1)) {
            //optimal for english, english only
            stringStream = new AbstractFileStringStream(sourceFile, INDEX_CHARSET);
        } else {
            stringStream = new AbstractFileStringIntStream(
                    sourceFile, extractScripts, extractUTF8, extractUTF16, INDEX_CHARSET);
        }

        try {
            success = true;
            //break input stream into chunks
            final byte[] stringChunkBuf = new byte[(int) MAX_STRING_CHUNK_SIZE];
            long readSize;
            while ((readSize = stringStream.read(stringChunkBuf, BOM_LEN, (int) MAX_STRING_CHUNK_SIZE - BOM_LEN)) != -1) {
                if (context.fileIngestIsCancelled()) {
                    ingester.ingest(this);
                    return true;
                }
                //FileOutputStream debug = new FileOutputStream("c:\\temp\\" + sourceFile.getName() + Integer.toString(this.numChunks + 1));
                //debug.write(stringChunkBuf, 0, (int) readSize);

                AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
                try {
                    chunk.index(ingester, stringChunkBuf, readSize + BOM_LEN, INDEX_CHARSET);
                    ++this.numChunks;
                } catch (IngesterException ingEx) {
                    success = false;
                    logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '"
                            + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
                    throw ingEx; //need to rethrow/return to signal error and move on
                }
                //debug.close();
            }

            //after all chunks, ingest the parent file without content itself, and store numChunks
            ingester.ingest(this);
        } catch (IOException ex) {
            logger.log(Level.WARNING, "Unable to read input stream to divide and send to Solr, file: " + sourceFile.getName(), ex); //NON-NLS
            success = false;
        } finally {
            try {
                stringStream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Error closing input stream, file: " + sourceFile.getName(), ex); //NON-NLS
            }
        }

        return success;
    }

    @Override
    public boolean isContentTypeSpecific() {
        return true;
    }

    @Override
    public boolean isSupported(AbstractFile file, String detectedFormat) {
        // strings can be run on anything.
        return true;
    }
}