/*
* Autopsy Forensic Browser
*
* Copyright 2011-2016 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.SolrInputDocument;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ContentVisitor;
import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SlackFile;
import org.sleuthkit.datamodel.TskCoreException;
/**
* Handles indexing files on a Solr core.
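*
* Obtain the singleton via getDefault(), ingest one or more items, and then
* call commit() so the new documents become searchable. A minimal,
* illustrative sketch (the someFile variable stands in for an AbstractFile
* obtained elsewhere from the current case):
* <pre>{@code
* Ingester ingester = Ingester.getDefault();
* ingester.ingest(someFile, true); // index file name, metadata, and content
* ingester.commit();               // make the added documents searchable
* }</pre>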
*/
class Ingester {
private static final Logger logger = Logger.getLogger(Ingester.class.getName());
private volatile boolean uncommitedIngests = false;
private final Server solrServer = KeywordSearch.getServer();
private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
private static Ingester instance;
//for ingesting a chunk as a SolrInputDocument (non-content-streaming, bypasses Tika)
//TODO use a streaming way to add content to /update handler
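//each chunk is read fully into a buffer of this size before being sent, which caps per-document memory use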
private static final int MAX_DOC_CHUNK_SIZE = 1024 * 1024;
private static final String ENCODING = "UTF-8"; //NON-NLS
private Ingester() {
}
public static synchronized Ingester getDefault() {
if (instance == null) {
instance = new Ingester();
}
return instance;
}
@Override
@SuppressWarnings("FinalizeDeclaration")
protected void finalize() throws Throwable {
super.finalize();
// Warn if files might have been left uncommitted.
if (uncommitedIngests) {
logger.warning("Ingester was used to add files that it never committed."); //NON-NLS
}
}
/**
* Sends a stream to Solr to have its content extracted and added to the
* index. commit() should be called once you're done ingesting files.
*
* @param afscs AbstractFileStringContentStream to ingest
*
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
*/
void ingest(AbstractFileStringContentStream afscs) throws IngesterException {
Map<String, String> params = getContentFields(afscs.getSourceContent());
ingest(afscs, params, afscs.getSourceContent().getSize());
}
/**
* Sends a TextExtractor to Solr to have its content extracted and added to
* the index. commit() should be called once you're done ingesting files.
* The indexed document represents the parent of the extracted chunks: it has
* no content of its own, only metadata (including the number of chunks), and
* is used to associate the extracted AbstractFileChunk documents with their
* source file.
*
* @param fe TextExtractor whose source file to ingest
*
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
*/
void ingest(TextExtractor fe) throws IngesterException {
Map<String, String> params = getContentFields(fe.getSourceFile());
params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks()));
ingest(new NullContentStream(fe.getSourceFile()), params, 0);
}
/**
* Sends an AbstractFileChunk and its extracted content stream to Solr to be
* added to the index. commit() should be called once you're done ingesting
* files. AbstractFileChunk represents a file chunk and its chunk content.
*
* @param fec AbstractFileChunk to ingest
* @param bcs ByteContentStream carrying the chunk's extracted text
* @param size approx. size of the stream in bytes, used for timeout
* estimation
*
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
*/
void ingest(AbstractFileChunk fec, ByteContentStream bcs, int size) throws IngesterException {
AbstractContent sourceContent = bcs.getSourceContent();
Map<String, String> params = getContentFields(sourceContent);
//overwrite id with the chunk id
params.put(Server.Schema.ID.toString(),
Server.getChunkIdString(sourceContent.getId(), fec.getChunkNumber()));
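//all other fields (times, file name, image id) still describe the parent file; only the id is chunk-specific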
ingest(bcs, params, size);
}
/**
* Sends a file to Solr to have its content extracted and added to the
* index. commit() should be called once you're done ingesting files. If the
* file is a directory or ingestContent is set to false, only the file name is
* indexed.
*
* @param file File to ingest
* @param ingestContent if true, index the file and its content; otherwise,
* index metadata only
*
* @throws IngesterException if there was an error processing a specific
* file, but the Solr server is probably fine.
*/
void ingest(AbstractFile file, boolean ingestContent) throws IngesterException {
if (!ingestContent || file.isDir()) {
ingest(new NullContentStream(file), getContentFields(file), 0);
} else {
ingest(new FscContentStream(file), getContentFields(file), file.getSize());
}
}
/**
* Creates a field map from a content object that is later sent to Solr.
*
* @param fsc content object to get fields from
*
* @return the map
*/
private Map<String, String> getContentFields(AbstractContent fsc) {
return fsc.accept(getContentFieldsV);
}
/**
* Visitor used to create the parameter list sent to the Solr index.
*/
private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {
@Override
protected Map<String, String> defaultVisit(Content cntnt) {
return new HashMap<>();
}
@Override
public Map<String, String> visit(File f) {
Map<String, String> params = getCommonFields(f);
getCommonFileContentFields(params, f);
return params;
}
@Override
public Map<String, String> visit(DerivedFile df) {
Map<String, String> params = getCommonFields(df);
getCommonFileContentFields(params, df);
return params;
}
@Override
public Map<String, String> visit(Directory d) {
Map<String, String> params = getCommonFields(d);
getCommonFileContentFields(params, d);
return params;
}
@Override
public Map<String, String> visit(LayoutFile lf) {
// layout files do not have times
return getCommonFields(lf);
}
@Override
public Map<String, String> visit(LocalFile lf) {
Map<String, String> params = getCommonFields(lf);
getCommonFileContentFields(params, lf);
return params;
}
@Override
public Map<String, String> visit(SlackFile f) {
Map<String, String> params = getCommonFields(f);
getCommonFileContentFields(params, f);
return params;
}
private Map<String, String> getCommonFileContentFields(Map<String, String> params, AbstractFile file) {
params.put(Server.Schema.CTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCtime(), file));
params.put(Server.Schema.ATIME.toString(), ContentUtils.getStringTimeISO8601(file.getAtime(), file));
params.put(Server.Schema.MTIME.toString(), ContentUtils.getStringTimeISO8601(file.getMtime(), file));
params.put(Server.Schema.CRTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCrtime(), file));
return params;
}
private Map<String, String> getCommonFields(AbstractFile af) {
Map<String, String> params = new HashMap<>();
params.put(Server.Schema.ID.toString(), Long.toString(af.getId()));
try {
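//despite the schema field name, IMAGE_ID holds the object id of the file's data source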
long dataSourceId = af.getDataSource().getId();
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
} catch (TskCoreException ex) {
logger.log(Level.SEVERE, "Could not get data source id to properly index the file {0}", af.getId()); //NON-NLS
params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
}
params.put(Server.Schema.FILE_NAME.toString(), af.getName());
return params;
}
}
/**
* Indexing method that bypasses Tika and assumes pure text. It reads and
* converts the entire content stream to a string, assuming UTF-8, since we
* can't use a streaming approach for the Solr /update handler. This should be
* safe, since all content is now in chunks of at most 1MB.
*
* TODO see if we can use a byte or string streaming way to add content to the
* /update handler, e.g. with XMLUpdateRequestHandler (deprecated in Solr
* 4.0.0); see if it is possible to stream with UpdateRequestHandler
*
* @param cs     content stream supplying the data to index
* @param fields map of Solr field names to values for the document
* @param size   approx. size of the stream in bytes; 0 means no content
*
* @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
*/
void ingest(ContentStream cs, Map<String, String> fields, final long size) throws IngesterException {
if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
//skip the file, image id unknown
String msg = NbBundle.getMessage(this.getClass(),
"Ingester.ingest.exception.unknownImgId.msg", cs.getName());
logger.log(Level.SEVERE, msg);
throw new IngesterException(msg);
}
final byte[] docChunkContentBuf = new byte[MAX_DOC_CHUNK_SIZE];
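//holds the chunk text to be indexed; at most MAX_DOC_CHUNK_SIZE bytes are read from the stream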
SolrInputDocument updateDoc = new SolrInputDocument();
for (Map.Entry<String, String> field : fields.entrySet()) {
updateDoc.addField(field.getKey(), field.getValue());
}
//using size here, but we are no longer ingesting entire files
//size is normally a chunk size, up to 1MB
if (size > 0) {
// TODO (RC): Use try with resources, adjust exception messages
InputStream is = null;
int read = 0;
try {
is = cs.getStream();
read = is.read(docChunkContentBuf);
} catch (IOException ex) {
throw new IngesterException(
NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.cantReadStream.msg",
cs.getName()), ex);
} finally {
if (null != is) {
try {
is.close();
} catch (IOException ex) {
logger.log(Level.WARNING, "Could not close input stream after reading content, " + cs.getName(), ex); //NON-NLS
}
}
}
if (read > 0) {
String s = "";
try {
s = new String(docChunkContentBuf, 0, read, ENCODING);
// Sanitize by replacing non-UTF-8 characters with caret '^' before adding to index
char[] chars = null;
for (int i = 0; i < s.length(); i++) {
if (!TextUtil.isValidSolrUTF8(s.charAt(i))) {
// only convert string to char[] if there is a non-UTF8 character
if (chars == null) {
chars = s.toCharArray();
}
chars[i] = '^';
}
}
// check if the string was modified (i.e. there was a non-UTF8 character found)
if (chars != null) {
s = new String(chars);
}
} catch (UnsupportedEncodingException ex) {
logger.log(Level.SEVERE, "Unsupported encoding", ex); //NON-NLS
}
updateDoc.addField(Server.Schema.CONTENT.toString(), s);
} else {
updateDoc.addField(Server.Schema.CONTENT.toString(), "");
}
} else {
//no content, such as the case when the 0th chunk is indexed
updateDoc.addField(Server.Schema.CONTENT.toString(), "");
}
try {
//TODO consider timeout thread, or vary socket timeout based on size of indexed content
solrServer.addDocument(updateDoc);
uncommitedIngests = true;
} catch (KeywordSearchModuleException ex) {
throw new IngesterException(
NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.err.msg", cs.getName()), ex);
}
}
/**
* Returns the timeout that should be used to index content of the given size.
*
* @param size size of the content in bytes
*
* @return timeout in seconds
*/
static int getTimeout(long size) {
if (size < 1024 * 1024L) //1MB
{
return 60;
} else if (size < 10 * 1024 * 1024L) //10MB
{
return 1200;
} else if (size < 100 * 1024 * 1024L) //100MB
{
return 3600;
} else {
return 3 * 3600;
}
}
/**
* Tells Solr to commit (necessary before ingested files will appear in
* searches)
*/
void commit() {
try {
solrServer.commit();
uncommitedIngests = false;
} catch (NoOpenCoreException | SolrServerException ex) {
logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
}
}
/**
* ContentStream that reads data from an AbstractFile object
*/
private static class FscContentStream implements ContentStream {
private AbstractFile f;
FscContentStream(AbstractFile f) {
this.f = f;
}
@Override
public String getName() {
return f.getName();
}
@Override
public String getSourceInfo() {
return NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getSrcInfo", f.getId());
}
@Override
public String getContentType() {
return null;
}
@Override
public Long getSize() {
return f.getSize();
}
@Override
public InputStream getStream() throws IOException {
return new ReadContentInputStream(f);
}
@Override
public Reader getReader() throws IOException {
throw new UnsupportedOperationException(
NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getReader"));
}
}
/**
* ContentStream associated with a Content object, but deliberately supplying no content; used to index metadata only
*/
private static class NullContentStream implements ContentStream {
AbstractContent aContent;
NullContentStream(AbstractContent aContent) {
this.aContent = aContent;
}
@Override
public String getName() {
return aContent.getName();
}
@Override
public String getSourceInfo() {
return NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getSrcInfo.text", aContent.getId());
}
@Override
public String getContentType() {
return null;
}
@Override
public Long getSize() {
return 0L;
}
@Override
public InputStream getStream() throws IOException {
return new ByteArrayInputStream(new byte[0]);
}
@Override
public Reader getReader() throws IOException {
throw new UnsupportedOperationException(
NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
}
}
/**
* Indicates that there was an error with the specific ingest operation, but
* it's still okay to continue ingesting files.
*/
static class IngesterException extends Exception {
private static final long serialVersionUID = 1L;
IngesterException(String message, Throwable ex) {
super(message, ex);
}
IngesterException(String message) {
super(message);
}
}
}