/*
* Autopsy Forensic Browser
*
* Copyright 2012 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractResult;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskCoreException;
/**
* Wrapper over StringExtract to provide streaming API Given AbstractFile
* object, extract international strings from the file and read output as a
* stream of UTF-8 strings as encoded bytes.
*
*/
class AbstractFileStringIntStream extends InputStream {
private static final Logger logger = Logger.getLogger(AbstractFileStringIntStream.class.getName());
private static final int FILE_BUF_SIZE = 1024 * 1024;
private AbstractFile content;
private final byte[] oneCharBuf = new byte[1];
private final StringExtract stringExtractor;
private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
private long fileReadOffset = 0L;
private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
private int convertBuffOffset = 0; //offset to start returning data to user on next read()
private int bytesInConvertBuff = 0; //amount of data currently in the buffer
private boolean fileEOF = false; //if file has more bytes to read
private boolean extractUTF8;
private boolean extractUTF16;
private Charset outCharset;
private StringExtractResult lastExtractResult;
/**
* Constructs new stream object that does conversion from file, to extracted
* strings, then to byte stream, for specified script, auto-detected
* encoding (UTF8, UTF16LE, UTF16BE), and specified output byte stream
* encoding
*
* @param content input content to process and turn into a stream to
* convert into strings
* @param scripts a list of scripts to consider
* @param extractUTF8 whether to extract utf8 encoding
* @param extractUTF16 whether to extract utf16 encoding
* @param outCharset encoding to use in the output byte stream
*/
public AbstractFileStringIntStream(AbstractFile content, List<SCRIPT> scripts, boolean extractUTF8,
boolean extractUTF16, Charset outCharset) {
this.content = content;
this.stringExtractor = new StringExtract();
this.stringExtractor.setEnabledScripts(scripts);
this.extractUTF8 = extractUTF8;
this.extractUTF16 = extractUTF16;
this.outCharset = outCharset;
this.stringExtractor.setEnableUTF8(extractUTF8);
this.stringExtractor.setEnableUTF16(extractUTF16);
}
@Override
public int read() throws IOException {
if (extractUTF8 == false && extractUTF16 == false) {
return -1;
}
final int read = read(oneCharBuf, 0, 1);
if (read == 1) {
return oneCharBuf[0];
} else {
return -1;
}
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
if (b == null) {
throw new NullPointerException();
} else if (off < 0 || len < 0 || len > b.length - off) {
throw new IndexOutOfBoundsException();
} else if (len == 0) {
return 0;
}
if (extractUTF8 == false && extractUTF16 == false) {
return -1;
}
long fileSize = content.getSize();
if (fileSize == 0) {
return -1;
}
//read and convert until user buffer full
//we have data if file can be read or when byteBuff has converted strings to return
int bytesToUser = 0; //returned to user so far
int offsetUser = off;
while (bytesToUser < len && offsetUser < len) {
//check if we have enough converted strings
int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
try {
//convert more strings, store in buffer
long toRead = 0;
//int shiftSize = 0;
//if (lastExtractResult != null && lastExtractResult.getTextLength() != 0
// && (shiftSize = FILE_BUF_SIZE - lastExtractResult.getFirstUnprocessedOff()) > 0) {
////a string previously extracted
////shift the fileReadBuff past last bytes extracted
////read only what's needed to fill the buffer
////to avoid loosing chars and breaking or corrupting potential strings - preserve byte stream continuity
//byte[] temp = new byte[shiftSize];
//System.arraycopy(fileReadBuff, lastExtractResult.getFirstUnprocessedOff(),
// temp, 0, shiftSize);
//System.arraycopy(temp, 0, fileReadBuff, 0, shiftSize);
//toRead = Math.min(lastExtractResult.getFirstUnprocessedOff(), fileSize - fileReadOffset);
//lastExtractResult = null;
//} else {
//fill up entire fileReadBuff fresh
toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
//}
int read = content.read(fileReadBuff, fileReadOffset, toRead);
if (read == -1 || read == 0) {
fileEOF = true;
} else {
fileReadOffset += read;
if (fileReadOffset >= fileSize) {
fileEOF = true;
}
//put converted string in convertBuff
convert(read);
convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
}
} catch (TskCoreException ex) {
//Exceptions.printStackTrace(ex);
fileEOF = true;
}
}
//nothing more to read, and no more bytes in convertBuff
if (convertBuff == null || convertBuffRemain == 0) {
if (fileEOF) {
return bytesToUser > 0 ? bytesToUser : -1;
} else {
//no strings extracted, try another read
continue;
}
}
//return part or all of convert buff to user
final int toCopy = Math.min(convertBuffRemain, len - offsetUser);
System.arraycopy(convertBuff, convertBuffOffset, b, offsetUser, toCopy);
//DEBUG
/*
* if (toCopy > 0) { FileOutputStream debug = new
* FileOutputStream("c:\\temp\\" + content.getName(), true);
* debug.write(b, offsetUser, toCopy); debug.close(); }
*/
convertBuffOffset += toCopy;
offsetUser += toCopy;
bytesToUser += toCopy;
}
//if more string data in convertBuff, will be consumed on next read()
return bytesToUser;
}
/**
* convert bytes in file buffer to string, and encode string in
* convertBuffer
*
* @param numBytes num bytes in the fileReadBuff
*/
private void convert(int numBytes) {
lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
convertBuff = lastExtractResult.getText().getBytes(outCharset);
//reset tracking vars
if (lastExtractResult.getNumBytes() == 0) {
bytesInConvertBuff = 0;
} else {
bytesInConvertBuff = convertBuff.length;
}
convertBuffOffset = 0;
}
}