/*
* Autopsy Forensic Browser
*
* Copyright 2011-2015 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.core.UserPreferences;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.MessageNotifyUtil;
import org.sleuthkit.autopsy.ingest.FileIngestModule;
import org.sleuthkit.autopsy.ingest.IngestJobContext;
import org.sleuthkit.autopsy.ingest.IngestMessage;
import org.sleuthkit.autopsy.ingest.IngestMessage.MessageType;
import org.sleuthkit.autopsy.ingest.IngestModuleReferenceCounter;
import org.sleuthkit.autopsy.ingest.IngestServices;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchService;
import org.sleuthkit.autopsy.keywordsearchservice.KeywordSearchServiceException;
import org.sleuthkit.autopsy.modules.filetypeid.FileTypeDetector;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.TskCoreException;
import org.sleuthkit.datamodel.TskData;
import org.sleuthkit.datamodel.TskData.FileKnown;
/**
 * A file-level ingest module. Performs indexing of allocated and
 * Solr-supported files, and string extraction and indexing of unallocated and
 * non-Solr-supported files. Index commits are done periodically (determined by
 * the user-set ingest update interval). Runs a periodic keyword / regular
 * expression search on the currently configured lists for ingest, writes
 * results to the blackboard, and reports interesting events to the Inbox and
 * to viewers.
 */
@NbBundle.Messages({
"# {0} - Reason for not starting Solr", "KeywordSearchIngestModule.init.tryStopSolrMsg={0}<br />Please try stopping Java Solr processes if any exist and restart the application.",
"KeywordSearchIngestModule.init.badInitMsg=Keyword search server was not properly initialized, cannot run keyword search ingest.",
"SolrConnectionCheck.Port=Invalid port number.",
"# {0} - Reason for not connecting to Solr", "KeywordSearchIngestModule.init.exception.errConnToSolr.msg=Error connecting to SOLR server: {0}.",
"KeywordSearchIngestModule.startUp.noOpenCore.msg=The index could not be opened or does not exist.",
"CannotRunFileTypeDetection=Unable to run file type detection."
})
public final class KeywordSearchIngestModule implements FileIngestModule {
/**
 * How often periodic index commits / searches should happen during ingest.
 * A larger value means more frequent updates; NONE effectively disables
 * periodic updates. NOTE(review): the unit/interpretation of {@code time} is
 * consumed by the search timer elsewhere — confirm before relying on it.
 */
enum UpdateFrequency {
FAST(20),
AVG(10),
SLOW(5),
SLOWEST(1),
NONE(Integer.MAX_VALUE),
DEFAULT(5);
// Frequency value associated with this setting.
private final int time;
UpdateFrequency(int time) {
this.time = time;
}
// Returns the raw frequency value for this setting.
int getTime() {
return time;
}
};
private static final Logger logger = Logger.getLogger(KeywordSearchIngestModule.class.getName());
private final IngestServices services = IngestServices.getInstance();
// Writes extracted text/metadata to the Solr index; obtained in startUp().
private Ingester ingester = null;
// Per-instance worker that routes files to the appropriate extractor.
private Indexer indexer;
private FileTypeDetector fileTypeDetector;
//only search images from current ingest, not images previously ingested/indexed
//accessed read-only by searcher thread
private boolean startedSearching = false;
// Ordered list of content-type-specific extractors (most specific first).
private List<TextExtractor> textExtractors;
// Fallback extractor that pulls raw strings out of unsupported content.
private StringsTextExtractor stringExtractor;
private final KeywordSearchJobSettings settings;
// Set true only after startUp() completes; process() skips files otherwise.
private boolean initialized = false;
private long jobId;
private long dataSourceId;
private static final AtomicInteger instanceCount = new AtomicInteger(0); //just used for logging
private int instanceNum = 0;
// Counts module instances per ingest job so first/last instance can do
// one-time setup (startUp) and summary posting (shutDown).
private static final IngestModuleReferenceCounter refCounter = new IngestModuleReferenceCounter();
private IngestJobContext context;
/**
 * Per-file outcome of indexing, recorded for end-of-job statistics
 * (see postIndexSummary()).
 */
private enum IngestStatus {
TEXT_INGESTED, ///< Text was extracted by knowing file type and text_ingested
STRINGS_INGESTED, ///< Strings were extracted from file
METADATA_INGESTED, ///< No content, so we just text_ingested metadata
SKIPPED_ERROR_INDEXING, ///< File was skipped because index engine had problems
SKIPPED_ERROR_TEXTEXTRACT, ///< File was skipped because of text extraction issues
SKIPPED_ERROR_IO ///< File was skipped because of IO issues reading it
};
// Maps ingest job id -> (file id -> status) for all running jobs.
private static final Map<Long, Map<Long, IngestStatus>> ingestStatus = new HashMap<>(); //guarded by itself
/**
 * Records the ingest status for a given file for a given ingest job. Used
 * for final statistics at the end of the job.
 *
 * @param ingestJobId id of ingest job
 * @param fileId      id of file
 * @param status      ingest status of the file
 */
private static void putIngestStatus(long ingestJobId, long fileId, IngestStatus status) {
    synchronized (ingestStatus) {
        Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(ingestJobId);
        if (ingestStatusForJob == null) {
            ingestStatusForJob = new HashMap<>();
            ingestStatus.put(ingestJobId, ingestStatusForJob);
        }
        // The per-job map is already stored in ingestStatus at this point
        // (either pre-existing or inserted above), so no second put of the
        // map is needed; the original code redundantly re-put it here.
        ingestStatusForJob.put(fileId, status);
    }
}
/**
 * Constructs the module with the per-job settings and assigns it a
 * per-instance number (used only for logging in shutDown()).
 *
 * @param settings keyword search job settings (enabled keyword lists, etc.)
 */
KeywordSearchIngestModule(KeywordSearchJobSettings settings) {
this.settings = settings;
instanceNum = instanceCount.getAndIncrement();
}
/**
 * Initializes the module for a new ingest run: verifies the Solr server and
 * open core, verifies file type detection can run, checks (once per job)
 * that there are searchable keywords, and sets up the text extractors.
 *
 * @param context the ingest job context
 *
 * @throws IngestModuleException if the Solr core is not open, the server is
 *                               unreachable, the configured port is invalid,
 *                               or file type detection cannot be started
 */
@Override
public void startUp(IngestJobContext context) throws IngestModuleException {
initialized = false;
jobId = context.getJobId();
dataSourceId = context.getDataSource().getId();
Server server = KeywordSearch.getServer();
// Fail fast if there is no open index core to write to.
if (server.coreIsOpen() == false) {
throw new IngestModuleException(Bundle.KeywordSearchIngestModule_startUp_noOpenCore_msg());
}
try {
fileTypeDetector = new FileTypeDetector();
} catch (FileTypeDetector.FileTypeDetectorInitException ex) {
throw new IngestModuleException(Bundle.CannotRunFileTypeDetection(), ex);
}
ingester = Server.getIngester();
this.context = context;
// increment the module reference count
// if first instance of this module for this job then check the server and existence of keywords
if (refCounter.incrementAndGet(jobId) == 1) {
if (Case.getCurrentCase().getCaseType() == Case.CaseType.MULTI_USER_CASE) {
// for multi-user cases need to verify connection to remote SOLR server
KeywordSearchService kwsService = new SolrSearchService();
int port;
try {
port = Integer.parseInt(UserPreferences.getIndexingServerPort());
} catch (NumberFormatException ex) {
// if there is an error parsing the port number
throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg() + " " + Bundle.SolrConnectionCheck_Port(), ex);
}
try {
kwsService.tryConnect(UserPreferences.getIndexingServerHost(), port);
} catch (KeywordSearchServiceException ex) {
throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_badInitMsg(), ex);
}
} else {
// for single-user cases need to verify connection to local SOLR service
try {
if (!server.isRunning()) {
throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()));
}
} catch (KeywordSearchModuleException ex) {
//this means Solr is not properly initialized
throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_tryStopSolrMsg(Bundle.KeywordSearchIngestModule_init_badInitMsg()), ex);
}
try {
// make an actual query to verify that server is responding
// we had cases where getStatus was OK, but the connection resulted in a 404
server.queryNumIndexedDocuments();
} catch (KeywordSearchModuleException | NoOpenCoreException ex) {
throw new IngestModuleException(Bundle.KeywordSearchIngestModule_init_exception_errConnToSolr_msg(ex.getMessage()), ex);
}
// check if this job has any searchable keywords
List<KeywordList> keywordLists = XmlKeywordSearchList.getCurrent().getListsL();
boolean hasKeywordsForSearch = false;
for (KeywordList keywordList : keywordLists) {
if (settings.keywordListIsEnabled(keywordList.getName()) && !keywordList.getKeywords().isEmpty()) {
hasKeywordsForSearch = true;
break;
}
}
// Warn (but do not fail) when nothing will be searched; indexing still runs.
if (!hasKeywordsForSearch) {
services.postMessage(IngestMessage.createWarningMessage(KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.noKwInLstMsg"),
NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.init.onlyIdxKwSkipMsg")));
}
}
}
//initialize extractors
stringExtractor = new StringsTextExtractor();
stringExtractor.setScripts(KeywordSearchSettings.getStringExtractScripts());
stringExtractor.setOptions(KeywordSearchSettings.getStringExtractOptions());
textExtractors = new ArrayList<>();
//order matters, more specific extractors first
textExtractors.add(new HtmlTextExtractor());
textExtractors.add(new TikaTextExtractor());
indexer = new Indexer();
initialized = true;
}
/**
 * Indexes the given file (content and/or metadata, depending on settings
 * and file type) and starts the keyword search job the first time it is
 * called for this ingest job. Always returns OK — indexing problems are
 * recorded in the ingest status map rather than failing the pipeline.
 *
 * @param abstractFile the file to process
 *
 * @return ProcessResult.OK in all cases
 */
@Override
public ProcessResult process(AbstractFile abstractFile) {
if (initialized == false) //error initializing indexing/Solr
{
logger.log(Level.WARNING, "Skipping processing, module not initialized, file: {0}", abstractFile.getName()); //NON-NLS
putIngestStatus(jobId, abstractFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
return ProcessResult.OK;
}
if (abstractFile.getType().equals(TskData.TSK_DB_FILES_TYPE_ENUM.VIRTUAL_DIR)) {
//skip indexing of virtual dirs (no content, no real name) - will index children files
return ProcessResult.OK;
}
// Known (hash-matched) files can be configured to skip content indexing.
if (KeywordSearchSettings.getSkipKnown() && abstractFile.getKnown().equals(FileKnown.KNOWN)) {
//index meta-data only
if (context.fileIngestIsCancelled()) {
return ProcessResult.OK;
}
indexer.indexFile(abstractFile, false);
return ProcessResult.OK;
}
//index the file and content (if the content is supported)
if (context.fileIngestIsCancelled()) {
return ProcessResult.OK;
}
indexer.indexFile(abstractFile, true);
// Start searching if it hasn't started already
if (!startedSearching) {
if (context.fileIngestIsCancelled()) {
return ProcessResult.OK;
}
List<String> keywordListNames = settings.getNamesOfEnabledKeyWordLists();
SearchRunner.getInstance().startJob(jobId, dataSourceId, keywordListNames);
startedSearching = true;
}
return ProcessResult.OK;
}
/**
 * After all files are ingested, ends the search job (triggering the final
 * index commit and final search), posts the per-job summary from the last
 * module instance, logs index counts, and cleans up resources. If the job
 * was cancelled, stops the search job and cleans up instead.
 */
@Override
public void shutDown() {
logger.log(Level.INFO, "Instance {0}", instanceNum); //NON-NLS
if ((initialized == false) || (context == null)) {
return;
}
if (context.fileIngestIsCancelled()) {
stop();
return;
}
// Remove from the search list and trigger final commit and final search
SearchRunner.getInstance().endJob(jobId);
// We only need to post the summary msg from the last module per job
if (refCounter.decrementAndGet(jobId) == 0) {
postIndexSummary();
// Drop this job's per-file statuses so the static map doesn't grow.
synchronized (ingestStatus) {
ingestStatus.remove(jobId);
}
}
//log number of files / chunks in index
//signal a potential change in number of text_ingested files
try {
final int numIndexedFiles = KeywordSearch.getServer().queryNumIndexedFiles();
final int numIndexedChunks = KeywordSearch.getServer().queryNumIndexedChunks();
logger.log(Level.INFO, "Indexed files count: {0}", numIndexedFiles); //NON-NLS
logger.log(Level.INFO, "Indexed file chunks count: {0}", numIndexedChunks); //NON-NLS
} catch (NoOpenCoreException | KeywordSearchModuleException ex) {
logger.log(Level.WARNING, "Error executing Solr query to check number of indexed files/chunks: ", ex); //NON-NLS
}
cleanup();
}
/**
 * Handles a stop event (ingest interrupted): stops the search job for this
 * ingest job and releases this instance's resources.
 */
private void stop() {
logger.log(Level.INFO, "stop()"); //NON-NLS
SearchRunner.getInstance().stopJob(jobId);
cleanup();
}
/**
 * Common cleanup code when the module stops or the final searcher
 * completes. Releases extractor references and marks the module
 * uninitialized so subsequent process() calls are skipped.
 */
private void cleanup() {
textExtractors.clear();
textExtractors = null;
stringExtractor = null;
initialized = false;
}
/**
 * Posts an inbox message summarizing this job's indexing results
 * (counts per IngestStatus), and pops an error/warning notification if
 * any files failed to index or extract.
 */
private void postIndexSummary() {
int text_ingested = 0;
int metadata_ingested = 0;
int strings_ingested = 0;
int error_text = 0;
int error_index = 0;
int error_io = 0;
// Tally per-file statuses recorded during this job.
synchronized (ingestStatus) {
Map<Long, IngestStatus> ingestStatusForJob = ingestStatus.get(jobId);
if (ingestStatusForJob == null) {
return;
}
for (IngestStatus s : ingestStatusForJob.values()) {
switch (s) {
case TEXT_INGESTED:
text_ingested++;
break;
case METADATA_INGESTED:
metadata_ingested++;
break;
case STRINGS_INGESTED:
strings_ingested++;
break;
case SKIPPED_ERROR_TEXTEXTRACT:
error_text++;
break;
case SKIPPED_ERROR_INDEXING:
error_index++;
break;
case SKIPPED_ERROR_IO:
error_io++;
break;
default:
;
}
}
}
// Build the HTML summary table shown in the inbox message.
StringBuilder msg = new StringBuilder();
msg.append("<table border=0><tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.knowFileHeaderLbl")).append("</td><td>").append(text_ingested).append("</td></tr>"); //NON-NLS
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.fileGenStringsHead")).append("</td><td>").append(strings_ingested).append("</td></tr>"); //NON-NLS
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.mdOnlyLbl")).append("</td><td>").append(metadata_ingested).append("</td></tr>"); //NON-NLS
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrLbl")).append("</td><td>").append(error_index).append("</td></tr>"); //NON-NLS
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errTxtLbl")).append("</td><td>").append(error_text).append("</td></tr>"); //NON-NLS
msg.append("<tr><td>").append(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.errIoLbl")).append("</td><td>").append(error_io).append("</td></tr>"); //NON-NLS
msg.append("</table>"); //NON-NLS
String indexStats = msg.toString();
logger.log(Level.INFO, "Keyword Indexing Completed: {0}", indexStats); //NON-NLS
services.postMessage(IngestMessage.createMessage(MessageType.INFO, KeywordSearchModuleFactory.getModuleName(), NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxResultsLbl"), indexStats));
// Indexing errors get an error pop-up; extraction/IO issues only warn.
if (error_index > 0) {
MessageNotifyUtil.Notify.error(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrsTitle"),
NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxErrMsgFiles", error_index));
} else if (error_io + error_text > 0) {
MessageNotifyUtil.Notify.warn(NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.kwIdxWarnMsgTitle"),
NbBundle.getMessage(this.getClass(), "KeywordSearchIngestModule.postIndexSummary.idxErrReadFilesMsg"));
}
}
/**
* File indexer, processes and indexes known/allocated files,
* unknown/unallocated files and directories accordingly
*/
private class Indexer {
private final Logger logger = Logger.getLogger(Indexer.class.getName());
/**
 * Extracts text from the file with the first (most specific) supported
 * text extraction module, divides it into chunks, and indexes the chunks.
 *
 * @param aFile          file to extract text from, divide into chunks and
 *                       index
 * @param detectedFormat mime-type detected, or null if none detected
 *
 * @return true if the file was text_ingested, false otherwise
 *
 * @throws IngesterException exception thrown if indexing failed
 */
private boolean extractTextAndIndex(AbstractFile aFile, String detectedFormat) throws IngesterException {
    // Extractor list is ordered most-specific first; the first match wins.
    for (TextExtractor candidate : textExtractors) {
        if (candidate.isSupported(aFile, detectedFormat)) {
            // Divide into chunks and index with the matching extractor.
            return candidate.index(aFile, context);
        }
    }
    // No extractor claimed this file/format combination.
    logger.log(Level.INFO, "No text extractor found for file id:{0}, name: {1}, detected format: {2}", new Object[]{aFile.getId(), aFile.getName(), detectedFormat}); //NON-NLS
    return false;
}
/**
 * Extract strings using heuristics from the file and add to index.
 * Note: returns true when ingest is cancelled (treated as "handled" so the
 * caller does not record an error), and records STRINGS_INGESTED /
 * SKIPPED_ERROR_* in the status map for the other outcomes.
 *
 * @param aFile file to extract strings from, divide into chunks and
 *              index
 *
 * @return true if the file was text_ingested, false otherwise
 */
private boolean extractStringsAndIndex(AbstractFile aFile) {
try {
if (context.fileIngestIsCancelled()) {
return true;
}
if (stringExtractor.index(aFile, KeywordSearchIngestModule.this.context)) {
putIngestStatus(jobId, aFile.getId(), IngestStatus.STRINGS_INGESTED);
return true;
} else {
logger.log(Level.WARNING, "Failed to extract strings and ingest, file ''{0}'' (id: {1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
return false;
}
} catch (IngesterException ex) {
logger.log(Level.WARNING, "Failed to extract strings and ingest, file '" + aFile.getName() + "' (id: " + aFile.getId() + ").", ex); //NON-NLS
putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
return false;
}
}
/**
 * Check with every extractor if it supports the file with the detected
 * format.
 *
 * @param aFile          file to check for
 * @param detectedFormat mime-type with detected format (such as
 *                       text/plain) or null if not detected
 *
 * @return true if text extraction is supported
 */
private boolean isTextExtractSupported(AbstractFile aFile, String detectedFormat) {
    for (TextExtractor extractor : textExtractors) {
        // Only content-type-specific extractors count here; the generic
        // strings extractor is handled separately by the caller.
        // (Dropped the redundant "== true" comparison from the original.)
        if (extractor.isContentTypeSpecific()
                && extractor.isSupported(aFile, detectedFormat)) {
            return true;
        }
    }
    return false;
}
/**
 * Adds the file to the index. Detects file type, calls extractors, etc.
 *
 * @param aFile        File to analyze
 * @param indexContent False if only metadata should be text_ingested. True
 *                     if content and metadata should be index.
 */
private void indexFile(AbstractFile aFile, boolean indexContent) {
    TskData.TSK_DB_FILES_TYPE_ENUM aType = aFile.getType();
    // unallocated and unused blocks can only have strings extracted from them.
    if ((aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNALLOC_BLOCKS) || aType.equals(TskData.TSK_DB_FILES_TYPE_ENUM.UNUSED_BLOCKS))) {
        if (context.fileIngestIsCancelled()) {
            return;
        }
        extractStringsAndIndex(aFile);
        return;
    }
    final long size = aFile.getSize();
    //if not to index content, or a dir, or 0 content, index meta data only
    if ((indexContent == false || aFile.isDir() || size == 0)) {
        indexMetaDataOnly(aFile);
        return;
    }
    String fileType;
    try {
        if (context.fileIngestIsCancelled()) {
            return;
        }
        fileType = fileTypeDetector.getFileType(aFile);
    } catch (TskCoreException ex) {
        logger.log(Level.SEVERE, String.format("Could not detect format using fileTypeDetector for file: %s", aFile), ex); //NON-NLS
        return;
    }
    // we skip archive formats that are opened by the archive module.
    // @@@ We could have a check here to see if the archive module was enabled though...
    if (TextExtractor.ARCHIVE_MIME_TYPES.contains(fileType)) {
        indexMetaDataOnly(aFile);
        return;
    }
    boolean wasTextAdded = false;
    //extract text with one of the extractors, divide into chunks and index with Solr
    try {
        if (context.fileIngestIsCancelled()) {
            return;
        }
        // Opaque binary data: no point running Tika, go straight to strings.
        if (fileType.equals("application/octet-stream")) {
            extractStringsAndIndex(aFile);
            return;
        }
        if (!extractTextAndIndex(aFile, fileType)) {
            logger.log(Level.WARNING, "Text extractor not found for file. Extracting strings only. File: ''{0}'' (id:{1}).", new Object[]{aFile.getName(), aFile.getId()}); //NON-NLS
            putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
        } else {
            putIngestStatus(jobId, aFile.getId(), IngestStatus.TEXT_INGESTED);
            wasTextAdded = true;
        }
    } catch (IngesterException e) {
        logger.log(Level.INFO, "Could not extract text with Tika, " + aFile.getId() + ", " //NON-NLS
                + aFile.getName(), e);
        putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
    } catch (Exception e) {
        logger.log(Level.WARNING, "Error extracting text with Tika, " + aFile.getId() + ", " //NON-NLS
                + aFile.getName(), e);
        putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_TEXTEXTRACT);
    }
    // if it wasn't supported or had an error, default to strings
    if (wasTextAdded == false) {
        extractStringsAndIndex(aFile);
    }
}

/**
 * Indexes only the metadata of the given file (no content), honoring
 * ingest cancellation, and records the outcome in the ingest status map.
 * Extracted to remove the duplicated metadata-only blocks that appeared
 * twice in indexFile().
 *
 * @param aFile file whose metadata should be indexed
 */
private void indexMetaDataOnly(AbstractFile aFile) {
    try {
        if (context.fileIngestIsCancelled()) {
            return;
        }
        ingester.ingest(aFile, false); //meta-data only
        putIngestStatus(jobId, aFile.getId(), IngestStatus.METADATA_INGESTED);
    } catch (IngesterException ex) {
        putIngestStatus(jobId, aFile.getId(), IngestStatus.SKIPPED_ERROR_INDEXING);
        logger.log(Level.WARNING, "Unable to index meta-data for file: " + aFile.getId(), ex); //NON-NLS
    }
}
}
}