/**
* FreeDesktopSearch - A Search Engine for your Desktop
* Copyright (C) 2013 Mirko Sertic
*
* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
package de.mirkosertic.desktopsearch;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.codec.net.URLCodec;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LegacyLongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.DrillDownQuery;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NRTCachingDirectory;
import org.apache.lucene.util.Bits;
import org.apache.tika.utils.DateUtils;
import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ForkJoinTask;
class LuceneIndexHandler {
private static final Logger LOGGER = Logger.getLogger(LuceneIndexHandler.class);
private static final int NUMBER_OF_FRAGMENTS = 5;
private final IndexWriter indexWriter;
private final SearcherManager searcherManager;
private final AnalyzerCache analyzerCache;
private final Analyzer analyzer;
private final FacetsConfig facetsConfig;
private final Thread commitThread;
private final FieldType contentFieldType;
private final ExecutorPool executorPool;
private final Configuration configuration;
private final PreviewProcessor previewProcessor;
/**
 * Opens the Lucene index located in the {@code index} sub directory of the configuration
 * directory, creates the writer and the searcher manager on top of it and starts the
 * background thread that periodically commits pending changes.
 *
 * @param aConfiguration the configuration; its config directory hosts the index
 * @param aAnalyzerCache provides the analyzer and the language specific field names
 * @param aExecutorPool the pool used to run the search result highlighting
 * @param aPreviewProcessor used to check whether a preview is available for a found file
 * @throws IOException if the index directory, the writer or the searcher manager cannot be opened
 */
public LuceneIndexHandler(Configuration aConfiguration, AnalyzerCache aAnalyzerCache, ExecutorPool aExecutorPool, PreviewProcessor aPreviewProcessor) throws IOException {
    previewProcessor = aPreviewProcessor;
    configuration = aConfiguration;
    analyzerCache = aAnalyzerCache;
    executorPool = aExecutorPool;

    // Content is stored and indexed with positions, offsets and full term vectors
    contentFieldType = new FieldType();
    contentFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    contentFieldType.setStored(true);
    contentFieldType.setTokenized(true);
    contentFieldType.setStoreTermVectorOffsets(true);
    contentFieldType.setStoreTermVectorPayloads(true);
    contentFieldType.setStoreTermVectorPositions(true);
    contentFieldType.setStoreTermVectors(true);

    analyzer = analyzerCache.getAnalyzer();

    File theIndexDirectory = new File(aConfiguration.getConfigDirectory(), "index");
    theIndexDirectory.mkdirs();

    Directory theIndexFSDirectory = new NRTCachingDirectory(FSDirectory.open(theIndexDirectory.toPath()), 100, 100);
    IndexWriterConfig theConfig = new IndexWriterConfig(analyzer);
    indexWriter = new IndexWriter(theIndexFSDirectory, theConfig);
    searcherManager = new SearcherManager(indexWriter, true, true, new SearcherFactory());

    commitThread = new Thread("Lucene Commit Thread") {
        @Override
        public void run() {
            while (!isInterrupted()) {
                if (indexWriter.hasUncommittedChanges()) {
                    try {
                        indexWriter.commit();
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
                try {
                    Thread.sleep(2000);
                } catch (InterruptedException e) {
                    // sleep() clears the interrupt flag when it throws. Restore it so that the
                    // loop condition sees the shutdown request and the thread terminates instead
                    // of looping forever on an already closed writer.
                    Thread.currentThread().interrupt();
                }
            }
        }
    };
    commitThread.start();

    facetsConfig = new FacetsConfig();
}
/**
 * Blocks until the searcher manager has refreshed its searcher.
 * NOTE(review): judging by the name this is invoked when a crawl run begins - confirm against the caller.
 *
 * @throws IOException if refreshing the searcher fails
 */
public void crawlingStarts() throws IOException {
searcherManager.maybeRefreshBlocking();
}
/**
 * Builds a Lucene document for the given content and writes it to the index, replacing
 * any document that is already indexed under the same file name.
 * <p>
 * String metadata values become facets under their metadata key; date metadata values become
 * three facets ({@code key-year-month-day}, {@code key-year}, {@code key-year-month}). All
 * metadata values are additionally appended to the indexed content text.
 *
 * @param aLocationId the id of the crawl location the file belongs to
 * @param aContent the extracted content and metadata of the file
 * @throws IOException if the index cannot be updated
 */
public void addToIndex(String aLocationId, Content aContent) throws IOException {
    Document theDocument = new Document();
    SupportedLanguage theLanguage = aContent.getLanguage();

    theDocument.add(new StringField(IndexFields.UNIQUEID, UUID.randomUUID().toString(), Field.Store.YES));
    theDocument.add(new StringField(IndexFields.FILENAME, aContent.getFileName(), Field.Store.YES));
    theDocument.add(new SortedSetDocValuesFacetField(IndexFields.LANGUAGEFACET, theLanguage.name()));
    theDocument.add(new TextField(IndexFields.LANGUAGESTORED, theLanguage.name(), Field.Store.YES));
    // The content hash is added exactly once; it is used to group identical files in search results
    theDocument.add(new TextField(IndexFields.CONTENTMD5, DigestUtils.md5Hex(aContent.getFileContent()), Field.Store.YES));

    StringBuilder theContentAsString = new StringBuilder(aContent.getFileContent());

    aContent.getMetadata().forEach(theEntry -> {
        if (StringUtils.isEmpty(theEntry.key)) {
            return;
        }
        Object theValue = theEntry.value;
        if (theValue instanceof String) {
            facetsConfig.setMultiValued(theEntry.key, true);
            String theStringValue = (String) theValue;
            theContentAsString.append(" ").append(theStringValue);
            if (!StringUtils.isEmpty(theStringValue)) {
                theDocument.add(new SortedSetDocValuesFacetField(theEntry.key, theStringValue));
            }
        } else if (theValue instanceof Date) {
            facetsConfig.setHierarchical(theEntry.key, true);
            Calendar theCalendar = GregorianCalendar.getInstance(DateUtils.UTC, Locale.US);
            theCalendar.setTime((Date) theValue);

            int theYear = theCalendar.get(Calendar.YEAR);
            int theMonth = theCalendar.get(Calendar.MONTH) + 1; // Calendar months are zero based
            int theDay = theCalendar.get(Calendar.DAY_OF_MONTH);

            // Locale.ROOT keeps the facet values independent of the JVM default locale
            addDateFacet(theDocument, theContentAsString, theEntry.key + "-year-month-day",
                    String.format(Locale.ROOT, "%04d/%02d/%02d", theYear, theMonth, theDay));
            addDateFacet(theDocument, theContentAsString, theEntry.key + "-year",
                    String.format(Locale.ROOT, "%04d", theYear));
            addDateFacet(theDocument, theContentAsString, theEntry.key + "-year-month",
                    String.format(Locale.ROOT, "%04d/%02d", theYear, theMonth));
        }
    });

    String theFullContent = theContentAsString.toString();
    if (analyzerCache.supportsLanguage(theLanguage)) {
        LOGGER.info("Language and analyzer " + theLanguage+" detected for " + aContent.getFileName()+", using the corresponding language index field");
        String theFieldName = analyzerCache.getFieldNameFor(theLanguage);
        theDocument.add(new Field(theFieldName, theFullContent, contentFieldType));
    } else {
        LOGGER.info("No matching language and analyzer detected for " + theLanguage+" and " + aContent.getFileName()+", using the default index field and analyzer");
        theDocument.add(new Field(IndexFields.CONTENT, theFullContent, contentFieldType));
    }

    theDocument.add(new Field(IndexFields.CONTENT_NOT_STEMMED, theFullContent, contentFieldType));
    theDocument.add(new StringField(IndexFields.LOCATIONID, aLocationId, Field.Store.YES));
    theDocument.add(new LegacyLongField(IndexFields.FILESIZE, aContent.getFileSize(), Field.Store.YES));
    theDocument.add(new LegacyLongField(IndexFields.LASTMODIFIED, aContent.getLastModified(), Field.Store.YES));

    // Update the document in our search index
    indexWriter.updateDocument(new Term(IndexFields.FILENAME, aContent.getFileName()), facetsConfig.build(theDocument));
}

/**
 * Registers a multi valued facet dimension, adds the facet value to the document and appends
 * the value to the full text content so that it can be found by a regular query as well.
 */
private void addDateFacet(Document aDocument, StringBuilder aContent, String aDimension, String aValue) {
    aContent.append(" ").append(aValue);
    facetsConfig.setMultiValued(aDimension, true);
    aDocument.add(new SortedSetDocValuesFacetField(aDimension, aValue));
}
/**
 * Deletes all documents from the index whose file name field equals the given file name.
 *
 * @param aFileName the file name as it was stored when the file was indexed
 * @throws IOException if the index writer fails to record the deletion
 */
public void removeFromIndex(String aFileName) throws IOException {
indexWriter.deleteDocuments(new Term(IndexFields.FILENAME, aFileName));
}
/**
 * Stops the background commit thread and releases the Lucene resources held by this handler.
 * Failures while closing are logged and do not propagate, so that one failing resource does
 * not prevent the other one from being closed.
 */
public void shutdown() {
    commitThread.interrupt();

    // Close the searcher manager first: it was created on top of the writer and would
    // otherwise keep its current reader open.
    try {
        searcherManager.close();
    } catch (Exception e) {
        LOGGER.error("Error while closing SearcherManager", e);
    }

    try {
        indexWriter.close();
    } catch (Exception e) {
        LOGGER.error("Error while closing IndexWriter", e);
    }
}
/**
 * Tells whether at least one document is indexed under the given file name.
 *
 * @param aFilename the file name to look up
 * @return {@code true} if the index contains a document for this file name
 * @throws IOException if the search fails
 */
public boolean checkIfExists(String aFilename) throws IOException {
    final IndexSearcher searcher = searcherManager.acquire();
    try {
        final Term fileNameTerm = new Term(IndexFields.FILENAME, aFilename);
        final TopDocs hits = searcher.search(new TermQuery(fileNameTerm), 100, Sort.INDEXORDER);
        return hits.scoreDocs.length > 0;
    } finally {
        // Always hand the searcher back, it is reference counted
        searcherManager.release(searcher);
    }
}
/**
 * Compares the last modification timestamp stored in the index with the given one.
 *
 * @param aFilename the file name to look up
 * @param aLastModified the current last modification timestamp of the file
 * @return {@code UNMODIFIED} only if exactly one document is indexed for the file name and its
 *         stored timestamp equals {@code aLastModified}; {@code UPDATED} in every other case
 * @throws IOException if the search fails
 */
public UpdateCheckResult checkIfModified(String aFilename, long aLastModified) throws IOException {
    final IndexSearcher searcher = searcherManager.acquire();
    try {
        final Term fileNameTerm = new Term(IndexFields.FILENAME, aFilename);
        final TopDocs hits = searcher.search(new TermQuery(fileNameTerm), 100, Sort.INDEXORDER);

        // Not indexed yet, or indexed several times (which needs a clean up): re-index in both cases
        if (hits.scoreDocs.length != 1) {
            return UpdateCheckResult.UPDATED;
        }

        final Document indexedDocument = searcher.doc(hits.scoreDocs[0].doc);
        final long indexedTimestamp = indexedDocument.getField(IndexFields.LASTMODIFIED).numericValue().longValue();
        return indexedTimestamp == aLastModified ? UpdateCheckResult.UNMODIFIED : UpdateCheckResult.UPDATED;
    } finally {
        searcherManager.release(searcher);
    }
}
/**
 * URL-encodes the given value with the default settings of {@link URLCodec}.
 *
 * @param aValue the value to encode
 * @return the encoded value, or {@code null} if the codec failed to encode it
 */
private String encode(String aValue) {
    URLCodec theURLCodec = new URLCodec();
    try {
        return theURLCodec.encode(aValue);
    } catch (EncoderException e) {
        // Keep the best effort contract (null result), but do not hide the failure
        LOGGER.warn("Cannot URL-encode value " + aValue, e);
        return null;
    }
}
/**
 * Parses the query string once per field name known to the analyzer cache and combines the
 * per-field queries as optional clauses, of which at least one has to match.
 *
 * @param aQueryString the query string entered by the user
 * @return the combined query
 * @throws IOException if parsing the query string fails
 */
private BooleanQuery computeBooleanQueryFor(String aQueryString) throws IOException {
    final QueryParser parser = new QueryParser(analyzer);
    final BooleanQuery.Builder builder = new BooleanQuery.Builder();
    for (String fieldName : analyzerCache.getAllFieldNames()) {
        builder.add(parser.parse(aQueryString, fieldName), BooleanClause.Occur.SHOULD);
    }
    builder.setMinimumNumberShouldMatch(1);
    return builder.build();
}
/**
 * Runs a full text search with optional facet drill down and assembles the result page data.
 * <p>
 * The search is only executed if a query string is given; otherwise an empty result (no
 * documents, no facet dimensions) is returned. Hits with an identical content hash are merged
 * into one result document, hits whose file no longer exists on disk are skipped, and the
 * highlighted fragments are computed asynchronously on the executor pool.
 *
 * @param aQueryString the query string, may be empty
 * @param aBacklink the back link that is passed through to the result
 * @param aBasePath the base path used to build the drill down links of the facets
 * @param aConfiguration provides the number of search results and the similar documents switch
 * @param aDrilldownFields the facet dimensions and values to drill down into
 * @return the query result
 * @throws IOException if refreshing the searcher or reading the facet state fails
 * @throws RuntimeException wrapping any exception raised while searching
 */
public QueryResult performQuery(String aQueryString, String aBacklink, String aBasePath, Configuration aConfiguration, Map<String, String> aDrilldownFields) throws IOException {

    searcherManager.maybeRefreshBlocking();
    IndexSearcher theSearcher = searcherManager.acquire();
    try {
        // Created inside the try block: if this constructor throws, the acquired searcher
        // is still released by the finally block below.
        SortedSetDocValuesReaderState theSortedSetState = new DefaultSortedSetDocValuesReaderState(theSearcher.getIndexReader());

        List<QueryResultDocument> theResultDocuments = new ArrayList<>();

        long theStartTime = System.currentTimeMillis();

        LOGGER.info("Querying for "+aQueryString);

        // SimpleDateFormat is not thread safe, so it must only be used on this thread
        DateFormat theDateFormat = new SimpleDateFormat("dd.MMMM.yyyy", Locale.ENGLISH);

        try {

            List<FacetDimension> theDimensions = new ArrayList<>();

            // Search only if a search query is given
            if (!StringUtils.isEmpty(aQueryString)) {

                Query theParsedQuery = computeBooleanQueryFor(aQueryString);
                LOGGER.info(" query is " + theParsedQuery);

                Query theQuery = theParsedQuery.rewrite(theSearcher.getIndexReader());
                LOGGER.info(" rewritten query is " + theQuery);

                DrillDownQuery theDrilldownQuery = new DrillDownQuery(facetsConfig, theQuery);
                for (Map.Entry<String, String> theEntry : aDrilldownFields.entrySet()) {
                    LOGGER.info(" with Drilldown "+theEntry.getKey()+" for "+theEntry.getValue());
                    theDrilldownQuery.add(theEntry.getKey(), theEntry.getValue());
                }

                FacetsCollector theFacetCollector = new FacetsCollector();

                TopDocs theDocs = FacetsCollector.search(theSearcher, theDrilldownQuery, aConfiguration.getNumberOfSearchResults(), Sort.RELEVANCE, true, true, theFacetCollector);
                SortedSetDocValuesFacetCounts theFacetCounts = new SortedSetDocValuesFacetCounts(theSortedSetState, theFacetCollector);

                List<Facet> theAuthorFacets = new ArrayList<>();
                List<Facet> theFileTypesFacets = new ArrayList<>();
                List<Facet> theLastModifiedYearFacet = new ArrayList<>();
                List<Facet> theLanguageFacet = new ArrayList<>();

                LOGGER.info("Found " + theDocs.scoreDocs.length + " documents");

                // We need this cache to detect duplicate documents while searching for similarities
                Set<Integer> theUniqueDocumentsFound = new HashSet<>();

                Map<String, QueryResultDocument> theDocumentsByHash = new HashMap<>();

                for (ScoreDoc theScoreDoc : theDocs.scoreDocs) {
                    int theDocumentID = theScoreDoc.doc;
                    theUniqueDocumentsFound.add(theDocumentID);

                    Document theDocument = theSearcher.doc(theDocumentID);

                    String theUniqueID = theDocument.getField(IndexFields.UNIQUEID).stringValue();
                    String theFoundFileName = theDocument.getField(IndexFields.FILENAME).stringValue();
                    String theHash = theDocument.getField(IndexFields.CONTENTMD5).stringValue();

                    // Files with identical content are reported as one result with several file names
                    QueryResultDocument theExistingDocument = theDocumentsByHash.get(theHash);
                    if (theExistingDocument != null) {
                        theExistingDocument.addFileName(theFoundFileName);
                        continue;
                    }

                    // Skip stale index entries before any highlighting work is scheduled for them
                    File theFileOnDisk = new File(theFoundFileName);
                    if (!theFileOnDisk.exists()) {
                        continue;
                    }

                    long theLastModified = theDocument.getField(IndexFields.LASTMODIFIED).numericValue().longValue();
                    SupportedLanguage theLanguage = SupportedLanguage.valueOf(theDocument.getField(IndexFields.LANGUAGESTORED).stringValue());
                    String theFieldName = analyzerCache.supportsLanguage(theLanguage)
                            ? analyzerCache.getFieldNameFor(theLanguage)
                            : IndexFields.CONTENT;

                    String theOriginalContent = theDocument.getField(theFieldName).stringValue();

                    // Format the date here and hand only the resulting String to the task, so
                    // that the shared DateFormat is never touched by the pool threads.
                    String theFormattedDate = theDateFormat.format(new Date(theLastModified));

                    ForkJoinTask<String> theHighligherResult = executorPool.submit(() -> {
                        StringBuilder theResult = new StringBuilder(theFormattedDate);
                        theResult.append(" - ");
                        Highlighter theHighlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(theQuery));
                        for (String theFragment : theHighlighter.getBestFragments(analyzer, theFieldName, theOriginalContent, NUMBER_OF_FRAGMENTS)) {
                            // The buffer already holds the date prefix, so every fragment is preceded by "..."
                            theResult.append("...").append(theFragment);
                        }
                        return theResult.toString();
                    });

                    // Scale the relevance to a value between 0 and 5
                    int theNormalizedScore = (int)(theScoreDoc.score / theDocs.getMaxScore() * 5);

                    boolean thePreviewAvailable = previewProcessor.previewAvailableFor(theFileOnDisk);

                    QueryResultDocument theNewDocument = new QueryResultDocument(theDocumentID, theFoundFileName, theHighligherResult,
                            theLastModified, theNormalizedScore, theUniqueID, thePreviewAvailable);
                    theDocumentsByHash.put(theHash, theNewDocument);
                    theResultDocuments.add(theNewDocument);
                }

                if (aConfiguration.isShowSimilarDocuments()) {

                    MoreLikeThis theMoreLikeThis = new MoreLikeThis(theSearcher.getIndexReader());
                    theMoreLikeThis.setAnalyzer(analyzer);
                    theMoreLikeThis.setMinTermFreq(1);
                    theMoreLikeThis.setMinDocFreq(1);
                    theMoreLikeThis.setFieldNames(analyzerCache.getAllFieldNames());

                    for (QueryResultDocument theDocument : theResultDocuments) {
                        Query theMoreLikeThisQuery = theMoreLikeThis.like(theDocument.getDocumentID());
                        TopDocs theMoreLikeThisTopDocs = theSearcher.search(theMoreLikeThisQuery, 5);
                        for (ScoreDoc theMoreLikeThisScoreDoc : theMoreLikeThisTopDocs.scoreDocs) {
                            int theSimilarDocument = theMoreLikeThisScoreDoc.doc;
                            // Only report documents that are neither a regular hit nor already reported as similar
                            if (theUniqueDocumentsFound.add(theSimilarDocument)) {
                                Document theMoreLikeThisDocument = theSearcher.doc(theSimilarDocument);
                                String theFilename = theMoreLikeThisDocument.getField(IndexFields.FILENAME).stringValue();
                                theDocument.addSimilarFile(theFilename);
                            }
                        }
                    }
                }

                LOGGER.info("Got Dimensions");
                for (FacetResult theResult : theFacetCounts.getAllDims(20000)) {
                    String theDimension = theResult.dim;
                    if ("author".equals(theDimension)) {
                        collectFacets(theResult, aBasePath, false, theAuthorFacets);
                    }
                    if ("extension".equals(theDimension)) {
                        collectFacets(theResult, aBasePath, false, theFileTypesFacets);
                    }
                    if ("last-modified-year".equals(theDimension)) {
                        collectFacets(theResult, aBasePath, false, theLastModifiedYearFacet);
                    }
                    if (IndexFields.LANGUAGEFACET.equals(theDimension)) {
                        collectFacets(theResult, aBasePath, true, theLanguageFacet);
                    }

                    LOGGER.info(" "+theDimension);
                }

                if (!theAuthorFacets.isEmpty()) {
                    theDimensions.add(new FacetDimension("Author", theAuthorFacets));
                }
                if (!theLastModifiedYearFacet.isEmpty()) {
                    theDimensions.add(new FacetDimension("Last modified", theLastModifiedYearFacet));
                }
                if (!theFileTypesFacets.isEmpty()) {
                    theDimensions.add(new FacetDimension("File types", theFileTypesFacets));
                }
                if (!theLanguageFacet.isEmpty()) {
                    theDimensions.add(new FacetDimension("Language", theLanguageFacet));
                }

                // Wait for all Tasks to complete for the search result highlighter
                ForkJoinTask.helpQuiesce();
            }

            long theDuration = System.currentTimeMillis() - theStartTime;

            LOGGER.info("Total amount of time : "+theDuration+"ms");

            return new QueryResult(theDuration, theResultDocuments, theDimensions, theSearcher.getIndexReader().numDocs(), aBacklink);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    } finally {
        searcherManager.release(theSearcher);
    }
}

/**
 * Converts the non empty labels of a facet result into facets with drill down links and adds
 * them to the target list. If {@code aLabelIsLanguageCode} is set, the label is treated as a
 * language code and displayed as its English language name.
 */
private void collectFacets(FacetResult aResult, String aBasePath, boolean aLabelIsLanguageCode, List<Facet> aTarget) {
    for (LabelAndValue theLabelAndValue : aResult.labelValues) {
        String theLabel = theLabelAndValue.label;
        if (StringUtils.isEmpty(theLabel)) {
            continue;
        }
        String theDisplayLabel = aLabelIsLanguageCode
                ? new Locale(theLabel).getDisplayLanguage(Locale.ENGLISH)
                : theLabel;
        aTarget.add(new Facet(theDisplayLabel, theLabelAndValue.value.intValue(),
                aBasePath + "/" + encode(
                        FacetSearchUtils.encode(aResult.dim, theLabel))));
    }
}
/**
 * Computes search phrase suggestions for the given term, based on the not stemmed content field.
 *
 * @param aTerm the term entered so far
 * @return the suggestions produced by the search phrase suggester
 * @throws IOException if refreshing the searcher or computing the suggestions fails
 */
public Suggestion[] findSuggestionTermsFor(String aTerm) throws IOException {
    searcherManager.maybeRefreshBlocking();
    final IndexSearcher searcher = searcherManager.acquire();
    try {
        final List<Suggestion> suggestions = new SearchPhraseSuggester(searcher, analyzer, configuration)
                .suggestSearchPhrase(IndexFields.CONTENT_NOT_STEMMED, aTerm);
        return suggestions.toArray(new Suggestion[suggestions.size()]);
    } finally {
        searcherManager.release(searcher);
    }
}
/**
 * Resolves the file that was indexed under the given unique document id.
 *
 * @param aUniqueID the unique id stored with the document
 * @return the file built from the stored file name, or {@code null} unless exactly one
 *         document matches the id
 * @throws IOException if refreshing the searcher or searching fails
 */
public File getFileOnDiskForDocument(String aUniqueID) throws IOException {
    searcherManager.maybeRefreshBlocking();
    final IndexSearcher searcher = searcherManager.acquire();
    try {
        final Term idTerm = new Term(IndexFields.UNIQUEID, aUniqueID);
        final TopDocs hits = searcher.search(new TermQuery(idTerm), 1, Sort.INDEXORDER);
        if (hits.totalHits != 1) {
            return null;
        }
        final Document match = searcher.doc(hits.scoreDocs[0].doc);
        return match == null ? null : new File(match.get(IndexFields.FILENAME));
    } finally {
        searcherManager.release(searcher);
    }
}
/**
 * Walks over all live documents of the index and deletes those whose file does not exist
 * on disk anymore.
 *
 * @throws IOException if refreshing the searcher, reading a document or deleting it fails
 */
public void cleanupDeadContent() throws IOException {
    searcherManager.maybeRefreshBlocking();
    IndexSearcher theSearcher = searcherManager.acquire();
    try {
        IndexReader theReader = theSearcher.getIndexReader();

        // IndexReader.document() does not check for deletions, so already deleted documents
        // have to be filtered out explicitly. A null Bits instance means "no deletions".
        Bits theLiveDocs = MultiFields.getLiveDocs(theReader);
        int theMaxDoc = theReader.maxDoc();

        for (int i = 0; i < theMaxDoc; i++) {
            if (theLiveDocs != null && !theLiveDocs.get(i)) {
                continue;
            }
            Document theDocument = theReader.document(i);
            File theFile = new File(theDocument.getField(IndexFields.FILENAME).stringValue());
            if (!theFile.exists()) {
                LOGGER.info("Removing file "+theFile+" from index as it does not exist anymore.");
                String theUniqueID = theDocument.getField(IndexFields.UNIQUEID).stringValue();
                indexWriter.deleteDocuments(new Term(IndexFields.UNIQUEID, theUniqueID));
            }
        }
    } finally {
        searcherManager.release(theSearcher);
    }
}
}