AbstractFileStringIntStream.java example

Explorer
autopsy-master
/*
 * Autopsy Forensic Browser
 *
 * Copyright 2012 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractResult;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskCoreException;

/**
 * Wrapper over StringExtract that provides a streaming API. Given an
 * AbstractFile object, extracts international strings from the file and
 * exposes them as a stream of bytes encoded in the requested output charset.
 *
 */
class AbstractFileStringIntStream extends InputStream {

    private static final Logger logger = Logger.getLogger(AbstractFileStringIntStream.class.getName());
    /** Size of the chunk read from the file and handed to the extractor at once. */
    private static final int FILE_BUF_SIZE = 1024 * 1024;
    private final AbstractFile content;
    private final byte[] oneCharBuf = new byte[1];
    private final StringExtract stringExtractor;
    private final byte[] fileReadBuff = new byte[FILE_BUF_SIZE];
    private long fileReadOffset = 0L; //offset in the file of the next chunk to read
    private byte[] convertBuff; //stores extracted string encoded as bytes, before returned to user
    private int convertBuffOffset = 0; //offset to start returning data to user on next read()
    private int bytesInConvertBuff = 0; //amount of data currently in the buffer
    private boolean fileEOF = false; //true once the file has no more bytes to read
    private final boolean extractUTF8;
    private final boolean extractUTF16;
    private final Charset outCharset;

    private StringExtractResult lastExtractResult;

    /**
     * Constructs new stream object that does conversion from file, to extracted
     * strings, then to byte stream, for specified script, auto-detected
     * encoding (UTF8, UTF16LE, UTF16BE), and specified output byte stream
     * encoding
     *
     * @param content      input content to process and turn into a stream to
     *                     convert into strings
     * @param scripts      a list of scripts to consider
     * @param extractUTF8  whether to extract utf8 encoding
     * @param extractUTF16 whether to extract utf16 encoding
     * @param outCharset   encoding to use in the output byte stream
     */
    public AbstractFileStringIntStream(AbstractFile content, List<SCRIPT> scripts, boolean extractUTF8,
            boolean extractUTF16, Charset outCharset) {
        this.content = content;
        this.stringExtractor = new StringExtract();
        this.stringExtractor.setEnabledScripts(scripts);
        this.extractUTF8 = extractUTF8;
        this.extractUTF16 = extractUTF16;
        this.outCharset = outCharset;
        this.stringExtractor.setEnableUTF8(extractUTF8);
        this.stringExtractor.setEnableUTF16(extractUTF16);
    }

    /**
     * Reads the next byte of extracted, encoded string data.
     *
     * @return the next byte as an unsigned value in the range 0-255, or -1 if
     *         the end of the stream has been reached or no encoding is
     *         enabled for extraction
     *
     * @throws IOException declared by the InputStream contract
     */
    @Override
    public int read() throws IOException {
        if (!extractUTF8 && !extractUTF16) {
            return -1;
        }
        final int read = read(oneCharBuf, 0, 1);
        if (read == 1) {
            //mask to an unsigned value: a raw (signed) byte >= 0x80 would be
            //returned negative, and 0xFF would be mistaken for end of stream
            return oneCharBuf[0] & 0xFF;
        }
        return -1;
    }

    /**
     * Reads up to len bytes of extracted, encoded string data into b, starting
     * at index off. File chunks are read and converted on demand until the
     * request is satisfied or the file is exhausted.
     *
     * @param b   destination buffer
     * @param off index in b at which to start storing bytes
     * @param len maximum number of bytes to store
     *
     * @return number of bytes stored, 0 if len is 0, or -1 if there is no more
     *         data (end of file, empty file, or no encoding enabled)
     *
     * @throws NullPointerException      if b is null
     * @throws IndexOutOfBoundsException if off or len is negative, or len is
     *                                   greater than b.length - off
     * @throws IOException               declared by the InputStream contract
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        } else if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        } else if (len == 0) {
            return 0;
        }

        if (!extractUTF8 && !extractUTF16) {
            return -1;
        }

        final long fileSize = content.getSize();
        if (fileSize == 0) {
            return -1;
        }

        //read and convert until user buffer full
        //we have data if file can be read or when convertBuff has converted strings to return
        int bytesToUser = 0; //returned to user so far; destination index is off + bytesToUser
        while (bytesToUser < len) {
            //check if we have enough converted strings
            int convertBuffRemain = bytesInConvertBuff - convertBuffOffset;

            if ((convertBuff == null || convertBuffRemain == 0) && !fileEOF && fileReadOffset < fileSize) {
                try {
                    //Fill the whole fileReadBuff fresh. Each chunk is extracted
                    //independently, so a string that straddles a chunk boundary
                    //may be split between two extractions.
                    final long toRead = Math.min(FILE_BUF_SIZE, fileSize - fileReadOffset);
                    final int read = content.read(fileReadBuff, fileReadOffset, toRead);
                    if (read <= 0) {
                        //no data (or an error code): treat as end of file rather
                        //than moving the offset backwards
                        fileEOF = true;
                    } else {
                        fileReadOffset += read;
                        if (fileReadOffset >= fileSize) {
                            fileEOF = true;
                        }

                        //put converted string in convertBuff
                        convert(read);
                        convertBuffRemain = bytesInConvertBuff - convertBuffOffset;
                    }
                } catch (TskCoreException ex) {
                    //best effort: report the failure and end the stream with
                    //whatever has been extracted so far
                    logger.log(Level.WARNING, "Error reading content for string extraction, file: "
                            + content.getName() + ", offset: " + fileReadOffset, ex);
                    fileEOF = true;
                }
            }

            //nothing more to read, and no more bytes in convertBuff
            if (convertBuff == null || convertBuffRemain == 0) {
                if (fileEOF || fileReadOffset >= fileSize) {
                    //the offset check also ends the stream if the reported size
                    //shrank between calls, instead of looping forever
                    return bytesToUser > 0 ? bytesToUser : -1;
                }
                //no strings extracted from this chunk, try another read
                continue;
            }

            //return part or all of convert buff to user
            final int toCopy = Math.min(convertBuffRemain, len - bytesToUser);
            System.arraycopy(convertBuff, convertBuffOffset, b, off + bytesToUser, toCopy);

            convertBuffOffset += toCopy;
            bytesToUser += toCopy;
        }

        //if more string data in convertBuff, will be consumed on next read()
        return bytesToUser;
    }

    /**
     * Convert bytes in the file buffer to a string, and encode the string into
     * convertBuff using the output charset. Resets the convert buffer tracking
     * variables so the next read() starts from the beginning of the new data.
     *
     * @param numBytes num bytes in the fileReadBuff
     */
    private void convert(int numBytes) {
        lastExtractResult = stringExtractor.extract(fileReadBuff, numBytes, 0);
        convertBuff = lastExtractResult.getText().getBytes(outCharset);

        //reset tracking vars
        if (lastExtractResult.getNumBytes() == 0) {
            bytesInConvertBuff = 0;
        } else {
            bytesInConvertBuff = convertBuff.length;
        }
        convertBuffOffset = 0;
    }
}