/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskException;

/**
 * AbstractFile input string stream reader/converter - given an AbstractFile,
 * extracts strings from it and returns them as encoded bytes via read().
 *
 * A "string" is a run of at least {@link #MIN_PRINTABLE_CHARS} characters
 * accepted by {@link StringExtract#isPrintableAscii(char)}. A single zero byte
 * between characters does not end a run; any other non-printable byte does.
 * Every run that is ended by such a byte is followed by a new line character
 * in the output; a run cut off by the end of the input is emitted without one.
 *
 * Note: the utility supports extraction of only LATIN script and UTF8, UTF16LE,
 * UTF16BE encodings and uses a brute force encoding detection - it's fast but
 * could apply multiple encodings on the same string.
 *
 * For other script/languages support and better encoding detection use
 * AbstractFileStringIntStream streaming class, which wraps around StringExtract
 * extractor.
 *
 * NOTE: extracted text is encoded chunk by chunk with
 * {@link String#getBytes(Charset)}, so an output charset whose encoder writes a
 * byte-order mark (e.g. "UTF-16") produces one mark per chunk.
 *
 * Instances keep mutable parsing state and perform no synchronization.
 */
class AbstractFileStringStream extends InputStream {

    private static final Logger logger = Logger.getLogger(AbstractFileStringStream.class.getName());
    private static final char NEW_LINE = (char) 10;
    private static final int READ_BUF_SIZE = 256;
    //num. of chars needed to qualify as a char string
    private static final int MIN_PRINTABLE_CHARS = 4;
    private static final byte[] NO_BYTES = new byte[0];

    //args
    private final AbstractFile content;
    private final Charset outputCharset;

    //raw input state
    private final byte[] curReadBuf = new byte[READ_BUF_SIZE];
    private int bytesInReadBuf = 0;
    private int readBufOffset = 0; //next unprocessed position in curReadBuf
    private long contentOffset = 0; //offset in content of the next chunk to load
    private boolean isEOF = false; //end of content reached, or reading it failed

    //extraction state, kept across read() calls so that the extracted text
    //does not depend on how the caller sizes its reads
    private final StringBuilder tempString = new StringBuilder(MIN_PRINTABLE_CHARS); //run that has not qualified yet
    private final StringBuilder curString = new StringBuilder(); //qualified text not yet encoded
    private boolean inString = false; //current run has already reached MIN_PRINTABLE_CHARS
    private boolean singleConsecZero = false; //previous byte was a tolerated lone zero

    //encoded output that did not fit into the caller's buffer yet
    private byte[] pendingBytes = NO_BYTES;
    private int pendingOffset = 0;

    private final byte[] oneCharBuf = new byte[1];

    /**
     * Construct new string stream from an AbstractFile
     *
     * @param content                to extract strings from
     * @param outputCharset          target encoding to index as
     * @param preserveOnBuffBoundary currently ignored; kept so that existing
     *                               callers keep compiling. Text that does not
     *                               fit into the caller's buffer is always
     *                               retained and returned by later reads.
     */
    public AbstractFileStringStream(AbstractFile content, Charset outputCharset, boolean preserveOnBuffBoundary) {
        this.content = content;
        this.outputCharset = outputCharset;
    }

    /**
     * Construct new string stream from an AbstractFile
     *
     * @param content    to extract strings from
     * @param outCharset target charset to encode into bytes and index as, e.g.
     *                   UTF-8
     */
    public AbstractFileStringStream(AbstractFile content, Charset outCharset) {
        this(content, outCharset, false);
    }

    /**
     * Reads up to len bytes of extracted, encoded strings into b.
     *
     * @param b   destination buffer
     * @param off offset in b of the first byte to write
     * @param len max. number of bytes to write
     *
     * @return number of bytes written (at least 1 when len is positive), 0 if
     *         len is 0, or -1 when no more strings can be extracted
     *
     * @throws NullPointerException      if b is null
     * @throws IndexOutOfBoundsException if off or len are negative, or len
     *                                   exceeds the space in b after off
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        } else if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        } else if (len == 0) {
            return 0;
        }

        //bytes left over from an earlier call are served before extracting more;
        //loop so that a chunk that encodes to no bytes cannot produce a 0 return
        while (pendingOffset >= pendingBytes.length) {
            if (!fillPending(len)) {
                return -1;
            }
        }

        final int copied = Math.min(len, pendingBytes.length - pendingOffset);
        System.arraycopy(pendingBytes, pendingOffset, b, off, copied);
        pendingOffset += copied;
        if (pendingOffset >= pendingBytes.length) {
            //fully consumed, release the array
            pendingBytes = NO_BYTES;
            pendingOffset = 0;
        }
        return copied;
    }

    /**
     * Extracts at least minChars characters of qualified text (less only if
     * the input ends first) and encodes them into the pending byte buffer.
     *
     * @param minChars number of characters to collect before encoding
     *
     * @return false if the input ended without yielding any more text
     */
    private boolean fillPending(int minChars) {
        while (!isEOF && curString.length() < minChars) {
            if (readBufOffset >= bytesInReadBuf && !refillReadBuf()) {
                break;
            }
            consume((char) curReadBuf[readBufOffset++]);
        }
        if (curString.length() == 0) {
            return false;
        }
        pendingBytes = curString.toString().getBytes(outputCharset);
        pendingOffset = 0;
        curString.setLength(0);
        return true;
    }

    /**
     * Loads the next chunk of content into the read buffer. Marks the stream
     * as ended when the content is empty, exhausted, or cannot be read.
     *
     * @return true if the read buffer holds new bytes to process
     */
    private boolean refillReadBuf() {
        int bytesRead = -1;
        if (content.getSize() != 0) {
            try {
                bytesRead = content.read(curReadBuf, contentOffset, READ_BUF_SIZE);
            } catch (TskException ex) {
                //text extracted so far is still returned to the caller
                logger.log(Level.WARNING, "Error reading content at offset " + contentOffset
                        + ", ending string extraction early", ex);
            }
        }
        readBufOffset = 0;
        if (bytesRead < 1) {
            bytesInReadBuf = 0;
            isEOF = true;
            //a run that never qualified is dropped; a qualified run is already in curString
            tempString.setLength(0);
            return false;
        }
        bytesInReadBuf = bytesRead;
        contentOffset += bytesRead;
        return true;
    }

    /**
     * Feeds one input character through the string extraction state machine.
     *
     * @param c next character of the input
     */
    private void consume(char c) {
        //a zero is tolerated only if the byte before it was not a tolerated zero
        singleConsecZero = (c == 0 && !singleConsecZero);

        if (StringExtract.isPrintableAscii(c)) {
            if (inString) {
                curString.append(c);
            } else {
                tempString.append(c);
                if (tempString.length() >= MIN_PRINTABLE_CHARS) {
                    //run qualifies: commit it, further chars go straight to the output
                    curString.append(tempString);
                    tempString.setLength(0);
                    inString = true;
                }
            }
        } else if (!singleConsecZero) {
            //break the run; only a qualified one is terminated with a new line
            if (inString) {
                curString.append(NEW_LINE);
                inString = false;
            }
            tempString.setLength(0);
        }
    }

    /**
     * Reads the next byte of extracted, encoded strings.
     *
     * @return the byte as a value in the range 0-255, or -1 when no more
     *         strings can be extracted
     */
    @Override
    public int read() throws IOException {
        final int read = read(oneCharBuf, 0, 1);
        //mask so that bytes above 0x7F are not returned as negative values
        return (read == 1) ? (oneCharBuf[0] & 0xFF) : -1;
    }

    /**
     * @return number of already encoded bytes waiting to be read; how many
     *         more bytes the remaining content will yield is not known
     */
    @Override
    public int available() throws IOException {
        return pendingBytes.length - pendingOffset;
    }

    @Override
    public long skip(long n) throws IOException {
        //use default implementation that reads into skip buffer
        //but it could be more efficient
        return super.skip(n);
    }
}