/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2016 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.ContentStream;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.TextUtil;
import org.sleuthkit.autopsy.datamodel.ContentUtils;
import org.sleuthkit.datamodel.AbstractContent;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.Content;
import org.sleuthkit.datamodel.ContentVisitor;
import org.sleuthkit.datamodel.DerivedFile;
import org.sleuthkit.datamodel.Directory;
import org.sleuthkit.datamodel.File;
import org.sleuthkit.datamodel.LayoutFile;
import org.sleuthkit.datamodel.LocalFile;
import org.sleuthkit.datamodel.ReadContentInputStream;
import org.sleuthkit.datamodel.SlackFile;
import org.sleuthkit.datamodel.TskCoreException;

/**
 * Handles indexing files on a Solr core.
 */
class Ingester {

    private static final Logger logger = Logger.getLogger(Ingester.class.getName());
    private volatile boolean uncommittedIngests = false;
    private final Server solrServer = KeywordSearch.getServer();
    private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
    private static Ingester instance;

    //for ingesting a chunk as a SolrInputDocument (non-content-streaming, bypasses Tika)
    //TODO use a streaming way to add content to the /update handler
    private static final int MAX_DOC_CHUNK_SIZE = 1024 * 1024;
    private static final String ENCODING = "UTF-8"; //NON-NLS

    private Ingester() {
    }

    public static synchronized Ingester getDefault() {
        if (instance == null) {
            instance = new Ingester();
        }
        return instance;
    }

    @Override
    @SuppressWarnings("FinalizeDeclaration")
    protected void finalize() throws Throwable {
        super.finalize();

        // Warn if files might have been left uncommitted.
        if (uncommittedIngests) {
            logger.warning("Ingester was used to add files that it never committed."); //NON-NLS
        }
    }

    /**
     * Sends a stream to Solr to have its content extracted and added to the
     * index. commit() should be called once you're done ingesting files.
     *
     * @param afscs AbstractFileStringContentStream to ingest
     *
     * @throws IngesterException if there was an error processing a specific
     *                           file, but the Solr server is probably fine.
     */
    void ingest(AbstractFileStringContentStream afscs) throws IngesterException {
        Map<String, String> params = getContentFields(afscs.getSourceContent());
        ingest(afscs, params, afscs.getSourceContent().getSize());
    }
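    /*
     * A minimal usage sketch (illustrative only; "file" stands in for a
     * hypothetical AbstractFile from the current case). Every ingest(...)
     * overload below follows the same add-then-commit lifecycle:
     *
     *   Ingester ingester = Ingester.getDefault();
     *   ingester.ingest(file, true); // queue the file name and content for indexing
     *   ingester.commit();           // nothing is searchable until commit()
     */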
    /**
     * Sends a TextExtractor to Solr to have its content extracted and added to
     * the index. commit() should be called once you're done ingesting files.
     * The TextExtractor represents the parent of the extracted file chunks.
     * The parent itself has no content, only metadata, and is used to
     * associate the extracted AbstractFileChunks with it.
     *
     * @param fe TextExtractor to ingest
     *
     * @throws IngesterException if there was an error processing a specific
     *                           file, but the Solr server is probably fine.
     */
    void ingest(TextExtractor fe) throws IngesterException {
        Map<String, String> params = getContentFields(fe.getSourceFile());
        params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks()));
        ingest(new NullContentStream(fe.getSourceFile()), params, 0);
    }

    /**
     * Sends an AbstractFileChunk to Solr and its extracted content stream to
     * be added to the index. commit() should be called once you're done
     * ingesting files. AbstractFileChunk represents a file chunk and its chunk
     * content.
     *
     * @param fec  AbstractFileChunk to ingest
     * @param bcs  content stream of the chunk
     * @param size approx. size of the stream in bytes, used for timeout
     *             estimation
     *
     * @throws IngesterException if there was an error processing a specific
     *                           file, but the Solr server is probably fine.
     */
    void ingest(AbstractFileChunk fec, ByteContentStream bcs, int size) throws IngesterException {
        AbstractContent sourceContent = bcs.getSourceContent();
        Map<String, String> params = getContentFields(sourceContent);

        //overwrite the object id with the chunk id
        params.put(Server.Schema.ID.toString(),
                Server.getChunkIdString(sourceContent.getId(), fec.getChunkNumber()));

        ingest(bcs, params, size);
    }

    /**
     * Sends a file to Solr to have its content extracted and added to the
     * index. commit() should be called once you're done ingesting files. If
     * the file is a directory or ingestContent is set to false, only the file
     * name is indexed.
     *
     * @param file          File to ingest
     * @param ingestContent if true, index the file name and its content,
     *                      otherwise index metadata only
     *
     * @throws IngesterException if there was an error processing a specific
     *                           file, but the Solr server is probably fine.
     */
    void ingest(AbstractFile file, boolean ingestContent) throws IngesterException {
        if (ingestContent == false || file.isDir()) {
            ingest(new NullContentStream(file), getContentFields(file), 0);
        } else {
            ingest(new FscContentStream(file), getContentFields(file), file.getSize());
        }
    }

    /**
     * Creates a field map from a piece of content, that is later sent to Solr.
     *
     * @param fsc content to get fields from
     *
     * @return the map
     */
    private Map<String, String> getContentFields(AbstractContent fsc) {
        return fsc.accept(getContentFieldsV);
    }
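    /*
     * Sketch of the field map produced for a regular file (field names come
     * from Server.Schema; the values shown are illustrative):
     *
     *   ID        -> "42"            // object id; chunks get a composite id
     *                                // built by Server.getChunkIdString(...)
     *   IMAGE_ID  -> "1"             // data source id, or "-1" if unknown
     *   FILE_NAME -> "report.docx"
     *   CTIME, ATIME, MTIME, CRTIME -> ISO-8601 timestamps (omitted for
     *                                  layout files, which have no times)
     */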
    /**
     * Visitor used to create the param list to send to the Solr index.
     */
    private class GetContentFieldsV extends ContentVisitor.Default<Map<String, String>> {

        @Override
        protected Map<String, String> defaultVisit(Content cntnt) {
            return new HashMap<>();
        }

        @Override
        public Map<String, String> visit(File f) {
            Map<String, String> params = getCommonFields(f);
            getCommonFileContentFields(params, f);
            return params;
        }

        @Override
        public Map<String, String> visit(DerivedFile df) {
            Map<String, String> params = getCommonFields(df);
            getCommonFileContentFields(params, df);
            return params;
        }

        @Override
        public Map<String, String> visit(Directory d) {
            Map<String, String> params = getCommonFields(d);
            getCommonFileContentFields(params, d);
            return params;
        }

        @Override
        public Map<String, String> visit(LayoutFile lf) {
            // layout files do not have times
            return getCommonFields(lf);
        }

        @Override
        public Map<String, String> visit(LocalFile lf) {
            Map<String, String> params = getCommonFields(lf);
            getCommonFileContentFields(params, lf);
            return params;
        }

        @Override
        public Map<String, String> visit(SlackFile f) {
            Map<String, String> params = getCommonFields(f);
            getCommonFileContentFields(params, f);
            return params;
        }

        private Map<String, String> getCommonFileContentFields(Map<String, String> params, AbstractFile file) {
            params.put(Server.Schema.CTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCtime(), file));
            params.put(Server.Schema.ATIME.toString(), ContentUtils.getStringTimeISO8601(file.getAtime(), file));
            params.put(Server.Schema.MTIME.toString(), ContentUtils.getStringTimeISO8601(file.getMtime(), file));
            params.put(Server.Schema.CRTIME.toString(), ContentUtils.getStringTimeISO8601(file.getCrtime(), file));
            return params;
        }

        private Map<String, String> getCommonFields(AbstractFile af) {
            Map<String, String> params = new HashMap<>();
            params.put(Server.Schema.ID.toString(), Long.toString(af.getId()));
            try {
                long dataSourceId = af.getDataSource().getId();
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
            } catch (TskCoreException ex) {
                logger.log(Level.SEVERE, "Could not get data source id to properly index the file {0}", af.getId()); //NON-NLS
                params.put(Server.Schema.IMAGE_ID.toString(), Long.toString(-1));
            }
            params.put(Server.Schema.FILE_NAME.toString(), af.getName());
            return params;
        }
    }
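    /*
     * GetContentFieldsV relies on the double dispatch in Content.accept(...),
     * so supporting a new Content subtype only requires another visit(...)
     * overload; unhandled types fall through to defaultVisit(...). Sketch
     * (illustrative):
     *
     *   Map<String, String> fields = someFile.accept(getContentFieldsV);
     *   // -> common id/name fields, plus MAC/CR times for types that have them
     */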
    /**
     * Indexing method that bypasses Tika and assumes pure text. It reads and
     * converts the entire content stream to a string, assuming UTF-8, since we
     * can't use a streaming approach for the Solr /update handler. This should
     * be safe, since all content is now in chunks of at most 1MB.
     *
     * TODO see if we can use a byte or string streaming way to add content to
     * the /update handler, e.g. with XMLUpdateRequestHandler (deprecated in
     * Solr 4.0.0); see if it is possible to stream with UpdateRequestHandler
     *
     * @param cs     content stream to ingest
     * @param fields the fields to index along with the content
     * @param size   size of the content stream in bytes
     *
     * @throws org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException
     *         if there was an error processing the content, but the Solr
     *         server is probably fine.
     */
    void ingest(ContentStream cs, Map<String, String> fields, final long size) throws IngesterException {

        if (fields.get(Server.Schema.IMAGE_ID.toString()) == null) {
            //skip the file, image id unknown
            String msg = NbBundle.getMessage(this.getClass(),
                    "Ingester.ingest.exception.unknownImgId.msg", cs.getName());
            logger.log(Level.SEVERE, msg);
            throw new IngesterException(msg);
        }

        final byte[] docChunkContentBuf = new byte[MAX_DOC_CHUNK_SIZE];
        SolrInputDocument updateDoc = new SolrInputDocument();

        for (String key : fields.keySet()) {
            updateDoc.addField(key, fields.get(key));
        }

        //using size here, but we are no longer ingesting entire files
        //size is normally a chunk size, up to 1MB
        if (size > 0) {
            // TODO (RC): Use try with resources, adjust exception messages
            InputStream is = null;
            int read = 0;
            try {
                is = cs.getStream();
                read = is.read(docChunkContentBuf);
            } catch (IOException ex) {
                throw new IngesterException(
                        NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.cantReadStream.msg",
                                cs.getName()));
            } finally {
                if (null != is) {
                    try {
                        is.close();
                    } catch (IOException ex) {
                        logger.log(Level.WARNING, "Could not close input stream after reading content, " + cs.getName(), ex); //NON-NLS
                    }
                }
            }

            // read is -1 at end-of-stream, so only decode the buffer when
            // bytes were actually read
            if (read > 0) {
                String s = "";
                try {
                    s = new String(docChunkContentBuf, 0, read, ENCODING);
                    // Sanitize by replacing non-UTF-8 characters with caret '^' before adding to index
                    char[] chars = null;
                    for (int i = 0; i < s.length(); i++) {
                        if (!TextUtil.isValidSolrUTF8(s.charAt(i))) {
                            // only convert the string to char[] if there is a non-UTF-8 character
                            if (chars == null) {
                                chars = s.toCharArray();
                            }
                            chars[i] = '^';
                        }
                    }
                    // check if the string was modified (i.e. a non-UTF-8 character was found)
                    if (chars != null) {
                        s = new String(chars);
                    }
                } catch (UnsupportedEncodingException ex) {
                    logger.log(Level.SEVERE, "Unsupported encoding", ex); //NON-NLS
                }
                updateDoc.addField(Server.Schema.CONTENT.toString(), s);
            } else {
                updateDoc.addField(Server.Schema.CONTENT.toString(), "");
            }
        } else {
            //no content, such as the case when the 0th (parent) chunk is indexed
            updateDoc.addField(Server.Schema.CONTENT.toString(), "");
        }

        try {
            //TODO consider timeout thread, or vary socket timeout based on size of indexed content
            solrServer.addDocument(updateDoc);
            uncommittedIngests = true;
        } catch (KeywordSearchModuleException ex) {
            throw new IngesterException(
                    NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.err.msg", cs.getName()), ex);
        }
    }
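    /*
     * Sanitization example (illustrative): characters that
     * TextUtil.isValidSolrUTF8(...) rejects, such as most control characters,
     * are replaced with '^' before the chunk is added to the index:
     *
     *   chunk text "ab<U+0001>cd"  ->  indexed as "ab^cd"
     *
     * The char[] copy is made lazily, so clean chunks (the common case) avoid
     * the extra allocation.
     */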
    /**
     * Returns the timeout that should be used to index content of the given
     * size.
     *
     * @param size size of the content in bytes
     *
     * @return timeout in seconds
     */
    static int getTimeout(long size) {
        if (size < 1024 * 1024L) { //1MB
            return 60;
        } else if (size < 10 * 1024 * 1024L) { //10MB
            return 1200;
        } else if (size < 100 * 1024 * 1024L) { //100MB
            return 3600;
        } else {
            return 3 * 3600;
        }
    }

    /**
     * Tells Solr to commit (necessary before ingested files will appear in
     * searches)
     */
    void commit() {
        try {
            solrServer.commit();
            uncommittedIngests = false;
        } catch (NoOpenCoreException | SolrServerException ex) {
            logger.log(Level.WARNING, "Error committing index", ex); //NON-NLS
        }
    }

    /**
     * ContentStream used to read() the data from an AbstractFile object.
     */
    private static class FscContentStream implements ContentStream {

        private final AbstractFile f;

        FscContentStream(AbstractFile f) {
            this.f = f;
        }

        @Override
        public String getName() {
            return f.getName();
        }

        @Override
        public String getSourceInfo() {
            return NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getSrcInfo", f.getId());
        }

        @Override
        public String getContentType() {
            return null;
        }

        @Override
        public Long getSize() {
            return f.getSize();
        }

        @Override
        public InputStream getStream() throws IOException {
            return new ReadContentInputStream(f);
        }

        @Override
        public Reader getReader() throws IOException {
            throw new UnsupportedOperationException(
                    NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getReader"));
        }
    }

    /**
     * ContentStream associated with a piece of content, but forced to have no
     * content.
     */
    private static class NullContentStream implements ContentStream {

        private final AbstractContent aContent;

        NullContentStream(AbstractContent aContent) {
            this.aContent = aContent;
        }

        @Override
        public String getName() {
            return aContent.getName();
        }

        @Override
        public String getSourceInfo() {
            return NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getSrcInfo.text", aContent.getId());
        }

        @Override
        public String getContentType() {
            return null;
        }

        @Override
        public Long getSize() {
            return 0L;
        }

        @Override
        public InputStream getStream() throws IOException {
            return new ByteArrayInputStream(new byte[0]);
        }

        @Override
        public Reader getReader() throws IOException {
            throw new UnsupportedOperationException(
                    NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
        }
    }

    /**
     * Indicates that there was an error with the specific ingest operation,
     * but it's still okay to continue ingesting files.
     */
    static class IngesterException extends Exception {

        private static final long serialVersionUID = 1L;

        IngesterException(String message, Throwable ex) {
            super(message, ex);
        }

        IngesterException(String message) {
            super(message);
        }
    }
}
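/*
 * Error-handling sketch (illustrative; "files" and "ingester" are
 * hypothetical): an IngesterException signals a per-file failure, so callers
 * typically log it and keep ingesting, committing once at the end:
 *
 *   Ingester ingester = Ingester.getDefault();
 *   for (AbstractFile f : files) {
 *       try {
 *           ingester.ingest(f, true);
 *       } catch (Ingester.IngesterException ex) {
 *           logger.log(Level.WARNING, "Failed to index " + f.getName(), ex);
 *       }
 *   }
 *   ingester.commit();
 */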