/*
* Autopsy Forensic Browser
*
* Copyright 2011-2015 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.EscapeUtil;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.Version;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardArtifact.ARTIFACT_TYPE;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.BlackboardAttribute.ATTRIBUTE_TYPE;
import org.sleuthkit.datamodel.SleuthkitCase;
import org.sleuthkit.datamodel.TskException;
/**
 * Performs a normal string (i.e. non-regexp) query to SOLR/Lucene. By default,
 * matches in all fields.
 *
 * The query string is always wrapped with KeywordSearchUtil.quoteQuery() before
 * it is sent to Solr, and results are fetched in pages of MAX_RESULTS documents.
 */
class LuceneQuery implements KeywordSearchQuery {

    private static final Logger logger = Logger.getLogger(LuceneQuery.class.getName());
    private static final int MAX_RESULTS = 20000;
    private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);

    static final int SNIPPET_LENGTH = 50;
    //can use different highlight schema fields for regex and literal search
    static final String HIGHLIGHT_FIELD_LITERAL = Server.Schema.TEXT.toString();
    static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.TEXT.toString();
    //TODO use content_ws stored="true" in solr schema for perfect highlight hits
    //static final String HIGHLIGHT_FIELD_REGEX = Server.Schema.CONTENT_WS.toString()

    private final String keywordString; //original unescaped query
    private String keywordStringEscaped; //query as it will be sent (before quoting)
    private boolean isEscaped;
    private final Keyword keyword;
    private final KeywordList keywordList;
    private final List<KeywordQueryFilter> filters = new ArrayList<>();
    private String field = null; //optional field to restrict the query to

    /**
     * Constructor with query to process.
     *
     * @param keywordList the list the keyword belongs to; returned unchanged by
     *                    getKeywordList() and passed to the QueryResults
     * @param keyword     the keyword to search for; its search term becomes the
     *                    query string (must not be null)
     */
    public LuceneQuery(KeywordList keywordList, Keyword keyword) {
        this.keywordList = keywordList;
        this.keyword = keyword;

        // @@@ BC: Long-term, we should try to get rid of this string and use only the
        // keyword object. Refactoring did not make its way through this yet.
        this.keywordString = keyword.getSearchTerm();
        this.keywordStringEscaped = this.keywordString;
    }

    @Override
    public void addFilter(KeywordQueryFilter filter) {
        this.filters.add(filter);
    }

    @Override
    public void setField(String field) {
        this.field = field;
    }

    /**
     * Appends a trailing wildcard to the current (possibly escaped) query.
     *
     * Note that this is not a full substring search. Normally substring
     * searches will be done with TermComponentQuery objects instead.
     *
     * Because escape() rebuilds the escaped string from the original keyword,
     * calling escape() after this method discards the appended wildcard.
     */
    @Override
    public void setSubstringQuery() {
        keywordStringEscaped = keywordStringEscaped + "*";
    }

    /**
     * Replaces the query to send with the Lucene-escaped form of the original
     * keyword string and marks this query as escaped.
     */
    @Override
    public void escape() {
        keywordStringEscaped = KeywordSearchUtil.escapeLuceneQuery(keywordString);
        isEscaped = true;
    }

    @Override
    public boolean isEscaped() {
        return isEscaped;
    }

    @Override
    public boolean isLiteral() {
        return true;
    }

    @Override
    public String getEscapedQueryString() {
        return this.keywordStringEscaped;
    }

    @Override
    public String getQueryString() {
        return this.keywordString;
    }

    /**
     * Runs the query and returns all hits under a single literal keyword built
     * from the original query string.
     *
     * @return the query results
     *
     * @throws KeywordSearchModuleException
     * @throws NoOpenCoreException
     */
    @Override
    public QueryResults performQuery() throws KeywordSearchModuleException, NoOpenCoreException {
        QueryResults results = new QueryResults(this, keywordList);
        //in case of single term literal query there is only 1 term
        final boolean showSnippets = KeywordSearchSettings.getShowSnippets();
        results.addResult(new Keyword(keywordString, true), performLuceneQuery(showSnippets));
        return results;
    }

    /**
     * @return true if the original query string is neither null nor empty
     */
    @Override
    public boolean validate() {
        return keywordString != null && !keywordString.isEmpty();
    }

    /**
     * Creates a TSK_KEYWORD_HIT artifact on the content of the hit and attaches
     * the keyword, optional preview, optional list name, optional selector
     * attribute and (for artifact hits) the associated artifact id.
     *
     * @param termHit  the term that was matched
     * @param hit      the hit to record
     * @param snippet  preview text, or null to omit the preview attribute
     * @param listName keyword list name, or null/empty to omit the set name
     *
     * @return the written artifact with its attributes, or null if the
     *         artifact or its attributes could not be written (the failure is
     *         logged)
     */
    @Override
    public KeywordCachedArtifact writeSingleFileHitsToBlackBoard(String termHit, KeywordHit hit, String snippet, String listName) {
        final String MODULE_NAME = KeywordSearchModuleFactory.getModuleName();

        final BlackboardArtifact bba;
        final KeywordCachedArtifact writeResult;
        try {
            bba = hit.getContent().newArtifact(ARTIFACT_TYPE.TSK_KEYWORD_HIT);
            writeResult = new KeywordCachedArtifact(bba);
        } catch (Exception e) {
            logger.log(Level.WARNING, "Error adding bb artifact for keyword hit", e); //NON-NLS
            return null;
        }

        Collection<BlackboardAttribute> attributes = new ArrayList<>();
        if (snippet != null) {
            attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_PREVIEW, MODULE_NAME, snippet));
        }
        attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD, MODULE_NAME, termHit));
        if (listName != null && !listName.isEmpty()) {
            attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_SET_NAME, MODULE_NAME, listName));
        }

        //bogus - workaround the dir tree table issue
        //attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_KEYWORD_REGEXP.getTypeID(), MODULE_NAME, "", ""));

        //selector
        if (keyword != null) {
            BlackboardAttribute.ATTRIBUTE_TYPE selType = keyword.getArtifactAttributeType();
            if (selType != null) {
                attributes.add(new BlackboardAttribute(selType, MODULE_NAME, termHit));
            }
        }

        if (hit.isArtifactHit()) {
            attributes.add(new BlackboardAttribute(ATTRIBUTE_TYPE.TSK_ASSOCIATED_ARTIFACT, MODULE_NAME, hit.getArtifact().getArtifactID()));
        }

        try {
            bba.addAttributes(attributes); //write out to bb
            writeResult.add(attributes);
            return writeResult;
        } catch (TskException e) {
            logger.log(Level.WARNING, "Error adding bb attributes to artifact", e); //NON-NLS
        }
        return null;
    }

    /**
     * Perform the query and collect a hit for every Solr document returned,
     * fetching the results page by page (MAX_RESULTS documents per request).
     *
     * @param snippets True if results should have a snippet
     *
     * @return list of KeywordHit objects, one per Solr document returned. The
     *         list may be incomplete if the case is closed or a hit cannot be
     *         created while results are being processed.
     *
     * @throws KeywordSearchModuleException
     * @throws NoOpenCoreException
     */
    private List<KeywordHit> performLuceneQuery(boolean snippets) throws KeywordSearchModuleException, NoOpenCoreException {
        final List<KeywordHit> matches = new ArrayList<>();
        final Server solrServer = KeywordSearch.getServer();
        final SolrQuery q = createAndConfigureSolrQuery(snippets);

        // cycle through results in sets of MAX_RESULTS
        boolean allMatchesFetched = false;
        for (int start = 0; !allMatchesFetched; start += MAX_RESULTS) {
            // The request must be re-issued for every page; only changing the
            // start offset without querying again would re-read the first page.
            q.setStart(start);
            final QueryResponse response = solrServer.query(q, METHOD.POST);
            final SolrDocumentList resultList = response.getResults();
            // objectId_chunk -> "text" -> List of previews
            final Map<String, Map<String, List<String>>> highlightResponse = response.getHighlighting();

            // widen before adding so the comparison with the long hit count cannot overflow
            allMatchesFetched = (long) start + MAX_RESULTS >= resultList.getNumFound();

            final SleuthkitCase sleuthkitCase;
            try {
                sleuthkitCase = Case.getCurrentCase().getSleuthkitCase();
            } catch (IllegalStateException ex) {
                //no case open, must be just closed
                return matches;
            }

            for (SolrDocument resultDoc : resultList) {
                try {
                    matches.add(createKeywordtHit(resultDoc, highlightResponse, sleuthkitCase));
                } catch (TskException ex) {
                    // give up on the remaining documents but keep what was collected
                    logger.log(Level.WARNING, "Error creating keyword hit from Solr document", ex); //NON-NLS
                    return matches;
                }
            }
        }
        return matches;
    }

    /**
     * Create the query object for the stored keyword: quoted query string
     * (optionally restricted to the configured field), page size, id-only
     * field list, ascending sort by id, all registered filters and, if
     * requested, highlighting.
     *
     * @param snippets True if query should request snippets
     *
     * @return the configured query
     */
    private SolrQuery createAndConfigureSolrQuery(boolean snippets) {
        SolrQuery q = new SolrQuery();
        q.setShowDebugInfo(DEBUG); //debug

        //set query, force quotes/grouping around all literal queries
        final String groupedQuery = KeywordSearchUtil.quoteQuery(keywordStringEscaped);
        //use the optional field if one was set
        final String theQueryStr = (field != null) ? field + ":" + groupedQuery : groupedQuery;

        q.setQuery(theQueryStr);
        q.setRows(MAX_RESULTS);
        q.setFields(Server.Schema.ID.toString());
        // a fixed sort order keeps paging through the results consistent
        q.addSort(Server.Schema.ID.toString(), SolrQuery.ORDER.asc);

        for (KeywordQueryFilter filter : filters) {
            q.addFilterQuery(filter.toString());
        }

        if (snippets) {
            configureHighlighting(q, HIGHLIGHT_FIELD_LITERAL, theQueryStr);
        }
        return q;
    }

    /**
     * Applies the highlighter settings shared by the bulk query and the single
     * document snippet query.
     *
     * @param q              the query to configure
     * @param highlightField the schema field to highlight
     * @param queryStr       the final query string, used to size fragCharSize
     */
    private static void configureHighlighting(SolrQuery q, String highlightField, String queryStr) {
        q.addHighlightField(highlightField);
        q.setHighlightSnippets(1);
        q.setHighlightFragsize(SNIPPET_LENGTH);

        //tune the highlighter
        q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
        // NOTE(review): pre and post tags are both the opening guillemet; kept as-is
        // because stored previews may rely on it - confirm before changing the post tag.
        q.setParam("hl.tag.pre", "«"); //makes sense for FastVectorHighlighter only NON-NLS
        q.setParam("hl.tag.post", "«"); //makes sense for FastVectorHighlighter only NON-NLS
        q.setParam("hl.fragListBuilder", "simple"); //makes sense for FastVectorHighlighter only NON-NLS

        //Solr bug if fragCharSize is smaller than Query string, StringIndexOutOfBoundsException is thrown.
        q.setParam("hl.fragCharSize", Integer.toString(queryStr.length())); //makes sense for FastVectorHighlighter only NON-NLS

        //docs says makes sense for the original Highlighter only, but not really
        //analyze all content SLOW! consider lowering
        q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS
    }

    /**
     * Builds a hit for one Solr document, using the first highlight snippet of
     * the document if keyword search is configured to use snippets and one is
     * available; otherwise the snippet is the empty string.
     *
     * @param solrDoc           the document returned by Solr
     * @param highlightResponse document id -> field -> snippets; may be null
     *                          when the response carries no highlighting
     * @param caseDb            the current case database (not used here)
     *
     * @return the hit for the document
     *
     * @throws TskException
     */
    private KeywordHit createKeywordtHit(SolrDocument solrDoc, Map<String, Map<String, List<String>>> highlightResponse, SleuthkitCase caseDb) throws TskException {
        final String docId = solrDoc.getFieldValue(Server.Schema.ID.toString()).toString();

        String snippet = "";
        if (KeywordSearchSettings.getShowSnippets() && highlightResponse != null) {
            final Map<String, List<String>> docHighlights = highlightResponse.get(docId);
            if (docHighlights != null) {
                final List<String> snippetList = docHighlights.get(HIGHLIGHT_FIELD_LITERAL);
                // list is null if there wasn't a snippet
                if (snippetList != null && !snippetList.isEmpty()) {
                    snippet = EscapeUtil.unEscapeHtml(snippetList.get(0)).trim();
                }
            }
        }
        return new KeywordHit(docId, snippet);
    }

    /**
     * return snippet preview context
     *
     * @param query        the keyword query for text to highlight. Lucene
     *                     special chars should already be escaped.
     * @param solrObjectId The Solr object id associated with the file or
     *                     artifact
     * @param isRegex      whether the query is a regular expression (different
     *                     Solr fields are then used to generate the preview)
     * @param group        whether the query should look for all terms grouped
     *                     together in the query order, or not
     *
     * @return the snippet for the un-chunked document, or an empty string if
     *         none is available
     *
     * @throws NoOpenCoreException
     */
    public static String querySnippet(String query, long solrObjectId, boolean isRegex, boolean group) throws NoOpenCoreException {
        return querySnippet(query, solrObjectId, 0, isRegex, group);
    }

    /**
     * return snippet preview context
     *
     * @param query        the keyword query for text to highlight. Lucene
     *                     special chars should already be escaped.
     * @param solrObjectId Solr object id associated with the hit
     * @param chunkID      chunk id associated with the content hit, or 0 if no
     *                     chunks
     * @param isRegex      whether the query is a regular expression (different
     *                     Solr fields are then used to generate the preview)
     * @param group        whether the query should look for all terms grouped
     *                     together in the query order, or not (only consulted
     *                     for regex queries; literal queries are always quoted)
     *
     * @return the first highlight snippet, HTML-unescaped and trimmed, or an
     *         empty string if Solr returned none or the query failed
     *
     * @throws NoOpenCoreException
     */
    public static String querySnippet(String query, long solrObjectId, int chunkID, boolean isRegex, boolean group) throws NoOpenCoreException {
        final Server solrServer = KeywordSearch.getServer();

        final String highlightField = isRegex ? LuceneQuery.HIGHLIGHT_FIELD_REGEX : LuceneQuery.HIGHLIGHT_FIELD_LITERAL;

        final String queryStr;
        if (isRegex) {
            final String quote = group ? "\"" : "";
            queryStr = highlightField + ":" + quote + query + quote;
        } else {
            //simplify query/escaping and use default field
            //always force grouping/quotes
            queryStr = KeywordSearchUtil.quoteQuery(query);
        }

        final String contentIDStr = (chunkID == 0)
                ? Long.toString(solrObjectId)
                : Server.getChunkIdString(solrObjectId, chunkID);
        final String idQuery = Server.Schema.ID.toString() + ":" + KeywordSearchUtil.escapeLuceneQuery(contentIDStr);

        SolrQuery q = new SolrQuery();
        q.setQuery(queryStr);
        q.setShowDebugInfo(DEBUG); //debug
        // restrict the search to the one document whose preview is wanted
        q.addFilterQuery(idQuery);
        configureHighlighting(q, highlightField, queryStr);

        try {
            QueryResponse response = solrServer.query(q, METHOD.POST);
            Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();
            if (responseHighlight == null) {
                return "";
            }
            Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIDStr);
            if (responseHighlightID == null) {
                return "";
            }
            List<String> contentHighlights = responseHighlightID.get(highlightField);
            if (contentHighlights == null || contentHighlights.isEmpty()) {
                return "";
            }
            // extracted content is HTML-escaped, but snippet goes in a plain text field
            return EscapeUtil.unEscapeHtml(contentHighlights.get(0)).trim();
        } catch (NoOpenCoreException ex) {
            logger.log(Level.WARNING, "Error executing Lucene Solr Query: " + query, ex); //NON-NLS
            throw ex;
        } catch (KeywordSearchModuleException ex) {
            logger.log(Level.WARNING, "Error executing Lucene Solr Query: " + query, ex); //NON-NLS
            return "";
        }
    }

    @Override
    public KeywordList getKeywordList() {
        return keywordList;
    }

    /**
     * Compares SolrDocuments based on their ID's. Two SolrDocuments with
     * different chunk numbers are considered equal.
     */
    private static class SolrDocumentComparatorIgnoresChunkId implements Comparator<SolrDocument> {

        @Override
        public int compare(SolrDocument left, SolrDocument right) {
            return Long.compare(objectIdOf(left), objectIdOf(right));
        }

        /**
         * Extracts the numeric object id from a document id.
         *
         * @param doc the document; its ID is in the form of ObjectId_Chunk, or
         *            just ObjectId when there is no chunk part
         *
         * @return the object id with any chunk suffix removed
         */
        private static long objectIdOf(SolrDocument doc) {
            String id = doc.getFieldValue(Server.Schema.ID.toString()).toString();
            final int separatorIndex = id.indexOf(Server.CHUNK_ID_SEPARATOR);
            if (separatorIndex != -1) {
                id = id.substring(0, separatorIndex);
            }
            return Long.parseLong(id);
        }
    }
}