/* * Copyright (c) 2009 Andrejs Jermakovics. * * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * Andrejs Jermakovics - initial implementation */ package it.unibz.instasearch.indexing; import it.unibz.instasearch.indexing.StorageIndexer.IndexChangeListener; import it.unibz.instasearch.indexing.querying.CSVExpander; import it.unibz.instasearch.indexing.querying.CurrentProjectSetter; import it.unibz.instasearch.indexing.querying.FieldAliasConverter; import it.unibz.instasearch.indexing.querying.FileNameSearcher; import it.unibz.instasearch.indexing.querying.FilterSetter; import it.unibz.instasearch.indexing.querying.FolderSearcher; import it.unibz.instasearch.indexing.querying.LastTermQueryPrefixer; import it.unibz.instasearch.indexing.querying.LowercaseConverter; import it.unibz.instasearch.indexing.querying.ModifiedTimeConverter; import it.unibz.instasearch.indexing.querying.QueryFuzzifier; import it.unibz.instasearch.indexing.querying.UppercaseNameExpander; import it.unibz.instasearch.indexing.querying.VisitableQuery; import it.unibz.instasearch.indexing.querying.WorkingSetExpander; import it.unibz.instasearch.prefs.PreferenceConstants; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.KeywordAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.QueryParser.Operator; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery.TooManyClauses; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocCollector; import org.apache.lucene.search.highlight.QueryTermExtractor; import org.apache.lucene.search.highlight.WeightedTerm; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Version; import org.eclipse.jface.util.IPropertyChangeListener; import org.eclipse.jface.util.PropertyChangeEvent; /** * Searcher for searching the index using SearchQuery */ @SuppressWarnings("deprecation") public class Searcher implements IPropertyChangeListener, IndexChangeListener { /** @see QueryParser#setPhraseSlop(int) */ private static final int DEFAULT_PHRASE_SLOP = 0; /** Minimum length of query in characters */ public static final int MIN_QUERY_LENGTH = 2; /** Minimum number of characters to be considered a word in indexing */ public static final int MIN_WORD_LENGTH = 1; /** Character that identifies the current project in search query */ public static final String CURRENT_PROJECT_CHAR = "."; private static final Version LUCENE_VERSION = Version.LUCENE_29; private IndexSearcher indexSearcher; public static final LengthNormSimilarity SIMILARITY = new LengthNormSimilarity(); //TODO: share with WorkspaceIndexer private static final QueryAnalyzer queryAnalyzer = new QueryAnalyzer(MIN_WORD_LENGTH); // Query visitors that modify the search query private CSVExpander csvExpander = new CSVExpander(); private WorkingSetExpander workingSetExpander = new WorkingSetExpander(); private UppercaseNameExpander uppercaseNameExpander = new UppercaseNameExpander(); private FieldAliasConverter fieldAliasConverter = new FieldAliasConverter(); private ModifiedTimeConverter modifiedTimeConverter = new ModifiedTimeConverter(); private LowercaseConverter lowercaseConverter = new LowercaseConverter(); private FolderSearcher folderSearcher = new FolderSearcher(); private FileNameSearcher fileNameSearcher = new FileNameSearcher(); private CurrentProjectSetter currentProjectSetter = new CurrentProjectSetter(); private QueryFuzzifier queryFuzzifier = new QueryFuzzifier(); private LastTermQueryPrefixer lastTermQueryPrefixer = new LastTermQueryPrefixer(MIN_QUERY_LENGTH+1); private FilterSetter filterSetter = new FilterSetter(); private boolean showMatchCounts = true; private boolean fuzzySearchAuto = true; private SearcherConfig config; public interface SearcherConfig { boolean getBoolPref(String pref); void log(Exception e); Directory getIndexDir() throws IOException; } /** * Searcher */ public Searcher(SearcherConfig config) { this.config = config; initPrefs(); } protected Directory getIndexDir() throws IOException { return config.getIndexDir(); } private SearchResult searchIndex(SearchQuery searchQuery) throws Exception { IndexSearcher indexSearcher = getIndexSearcher(); IndexReader reader = indexSearcher.getIndexReader(); boolean exact = searchQuery.isExact(); Query query = null; try { query = parseSearchQuery(searchQuery, reader, exact, true); } catch(TooManyClauses e) { // too many, try without prefix search query = parseSearchQuery(searchQuery, reader, exact, false); } catch(ParseException e) { // remove special query characters String newSearchString = searchQuery.getSearchString().replaceAll("[\\(\\)\"\\[\\]'\\{\\}]", " "); try { searchQuery.setSearchString(newSearchString); query = parseSearchQuery(searchQuery, reader, exact, true); } catch(ParseException ignored) { // can have error while typing query, just ignore //debug(newSearchString, " - ", ignored.getMessage()); return null; } } SearchResult searchResut = collectSearchResults(searchQuery, indexSearcher, reader, query); return searchResut; } private SearchResult collectSearchResults(SearchQuery searchQuery, IndexSearcher indexSearcher, IndexReader reader, Query query) throws IOException { int maxResults = reader.numDocs(); // all documents if( searchQuery.isLimited() ) maxResults = searchQuery.getMaxResults(); Map<String, Float> searchTerms = extractTerms(query); TopDocCollector collector = new TopDocCollector( maxResults ); if( searchQuery.isCanceled() ) return null; indexSearcher.search(query, collector); // do the actual search if( collector.getTotalHits() == 0 ) return null; ScoreDoc[] hits = collector.topDocs().scoreDocs; ArrayList<SearchResultDoc> resultDocs = new ArrayList<SearchResultDoc>(hits.length); for (int i = 0; i < hits.length && !searchQuery.isCanceled(); i++) { int docId = hits[i].doc; float score = hits[i].score; Document doc = reader.document(docId); SearchResultDoc resultDoc = new SearchResultDoc(getIndexDir(), doc, docId, score); if( showMatchCounts ) resultDoc.computeMatchCount(reader, searchTerms.keySet()); resultDocs.add(resultDoc); } return new SearchResult(searchQuery, resultDocs, searchTerms); } /** * Performs a search using the search query * @param searchQuery * * @return SearchResult or null if nothing is found * @throws Exception */ public SearchResult search(SearchQuery searchQuery) throws Exception { String searchString = searchQuery.getSearchString(); if( searchString == null || searchString.length() < MIN_QUERY_LENGTH ) return null; SearchResult result = null; if( searchQuery.isFuzzy() ) { searchQuery.setExact(false); result = searchIndex(searchQuery); // search fuzzy } else { // 1. search exact text if( searchQuery.isExact() ) result = searchIndex(searchQuery); // 2. split search text into tokens and search (non-exact) if( (result==null || result.isEmpty()) && !searchQuery.isCanceled() ) { searchQuery.setExact(false); result = searchIndex(searchQuery); } // 3. search wildcarded and fuzzy matches if( (result==null || result.isEmpty()) && fuzzySearchAuto && !searchQuery.isCanceled() ){ // if fuzzy search enabled searchQuery.setFuzzy(true); searchQuery.setExact(false); result = searchIndex(searchQuery); } } return result; } /** * @return the indexSearcher * @throws IOException */ private IndexSearcher getIndexSearcher() throws IOException { if( indexSearcher == null ) { indexSearcher = new IndexSearcher( getIndexDir(), true ); indexSearcher.setSimilarity( SIMILARITY ); } // indexSearcher.getIndexReader().isCurrent() return indexSearcher; } /** * Get all terms that start with prefixText * @param prefixText * @param prefixField * * @return proposed terms * @throws IOException */ public List<String> getProposals(String prefixText, Field prefixField) throws IOException { ArrayList<String> proposals = new ArrayList<String>(); IndexReader reader = getIndexSearcher().getIndexReader(); Term prefix = prefixField.createTerm(prefixText); TermEnum enumerator = reader.terms(prefix); prefixText = prefixText.toLowerCase(Locale.ENGLISH); try { do { Term term = enumerator.term(); if (term != null && term.text().toLowerCase(Locale.ENGLISH).startsWith(prefixText) && term.field().equalsIgnoreCase(prefixField.toString())) { proposals.add(term.text()); } else { break; } } while (enumerator.next()); } finally { enumerator.close(); } return proposals; } /** * Get names of all projects that are in the index * * @return list of indexed projects * @throws IOException */ public List<String> getIndexProjects() throws IOException { return getProposals("", Field.PROJ); } /** * Need to reset searcher when index has been updated * Otherwise changes won't appear in the results * It is recommended to do a warmup search after the searcher was reset * */ public void onIndexReset() { resetSearcher(); } private void resetSearcher() { if( indexSearcher != null ) { try { indexSearcher.close(); } catch (IOException e) { config.log(e); } finally { indexSearcher = null; } } } /** * Warmup search. Searches after this will be faster * * @throws Exception */ private void warmup() throws Exception { search(new SearchQuery("<warmup search>", 1)); } /** * Closes and re-opens the index for searching * Should be called after the index is updated * */ public void onIndexUpdate() { resetSearcher(); try { warmup(); } catch (Exception e) { config.log(e); } } private Query parseSearchQuery(SearchQuery searchQuery, IndexReader reader, boolean exact, boolean prefix) throws ParseException, IOException { String searchString = searchQuery.getSearchString(); BooleanQuery.setMaxClauseCount(5000); // so we don't get TooManyClauses exceptions Query exactQuery = createExactQuery(searchQuery); Query returnQuery; if( exact ) // want exact search, use KeywordAnalyzer { returnQuery = exactQuery; } else { Query query = parserSearchString(searchString, queryAnalyzer); exactQuery.setBoost(query.getBoost() * 2f); // exact query more important returnQuery = combineQueries(query, exactQuery); } returnQuery = rewriteQuery(searchQuery, prefix, returnQuery); returnQuery = returnQuery.rewrite(reader); // lucene's rewrite (ie expand prefix queries) //System.out.println("q: " + returnQuery + " - exact " + exact); return returnQuery; } /** * @param searchQuery * @return * @throws ParseException */ private Query createExactQuery(SearchQuery searchQuery) throws ParseException { Query query = null; String searchString = searchQuery.getSearchString(); if( searchString.contains(" ") ) { query = parserSearchString(searchString, new StandardAnalyzer(LUCENE_VERSION)); query = convertToPhraseQuery(query); } else { query = parserSearchString(searchString, new KeywordAnalyzer()); // searchstring is one term } VisitableQuery visitableQuery = new VisitableQuery(query); visitableQuery.accept(uppercaseNameExpander); visitableQuery.accept(fileNameSearcher); query = visitableQuery.getQuery(); return query; } private static Query convertToPhraseQuery(Query query) { PhraseQuery phraseQuery = new PhraseQuery(); Set<Term> terms = new LinkedHashSet<Term>(); try { query.extractTerms(terms); for(Term term: terms) { Field field = Field.fromTerm(term); if( Field.CONTENTS == field ) phraseQuery.add(term); else return query; } } catch(UnsupportedOperationException ignored) // not all queries support it { return query; } return phraseQuery; } private Query rewriteQuery(SearchQuery searchQuery, boolean prefix, Query query) { VisitableQuery visitableQuery = new VisitableQuery(query); visitableQuery.accept(uppercaseNameExpander); visitableQuery.accept(lowercaseConverter); if( searchQuery.isFuzzy() ) visitableQuery.accept(queryFuzzifier); visitableQuery.accept(csvExpander); visitableQuery.accept(fieldAliasConverter); visitableQuery.accept(modifiedTimeConverter); visitableQuery.accept(folderSearcher); if( prefix && !searchQuery.isFuzzy() ) // prefix last term query for substring search while typing visitableQuery.accept(lastTermQueryPrefixer); if( searchQuery.getFilter() != null ) { filterSetter.setFilter( searchQuery.getFilter() ); visitableQuery.accept(filterSetter); } visitableQuery.accept(workingSetExpander); if( searchQuery.getCurrentProject() != null ) { currentProjectSetter.setCurrentProject(searchQuery.getCurrentProject()); visitableQuery.accept(currentProjectSetter); } return visitableQuery.getQuery(); // the modified query after all visitors } private BooleanQuery combineQueries(Query query, Query exactQuery) { BooleanQuery topQuery = new BooleanQuery(); topQuery.add(exactQuery, Occur.SHOULD); topQuery.add(query, Occur.SHOULD); return topQuery; } private Query parserSearchString(String searchString, Analyzer analyzer) throws ParseException { QueryParser queryParser = new QueryParser(LUCENE_VERSION, Field.CONTENTS.toString(), analyzer); queryParser.setDefaultOperator(Operator.AND); // all fields required queryParser.setLowercaseExpandedTerms(false); queryParser.setPhraseSlop(DEFAULT_PHRASE_SLOP); /* * Allow words in the query to begin with * * see http://lucene.apache.org/java/2_3_1/api/org/apache/lucene/queryParser/QueryParser.html#setAllowLeadingWildcard%28boolean%29 */ queryParser.setAllowLeadingWildcard(true); Query parsedQuery = queryParser.parse(searchString); return parsedQuery; } /** * Extracts search terms and their weights * @param query * @return */ private static Map<String, Float> extractTerms(Query query) { WeightedTerm[] weightedTerms = QueryTermExtractor.getTerms(query, false, Field.CONTENTS.toString()); Map<String,Float> terms = new HashMap<String,Float>(weightedTerms.length); for(WeightedTerm weightedTerm: weightedTerms) terms.put( weightedTerm.getTerm(), weightedTerm.getWeight() ); return terms; } protected void initPrefs() { fuzzySearchAuto = config.getBoolPref(PreferenceConstants.P_FUZZY_SEARCH_AUTO); showMatchCounts = config.getBoolPref(PreferenceConstants.P_SHOW_MATCH_COUNT); } @Override public void propertyChange(PropertyChangeEvent event) { String prop = event.getProperty(); if( PreferenceConstants.P_SHOW_MATCH_COUNT.equals(prop) ) showMatchCounts = config.getBoolPref(PreferenceConstants.P_SHOW_MATCH_COUNT); else if( PreferenceConstants.P_FUZZY_SEARCH_AUTO.equals(prop) ) fuzzySearchAuto = config.getBoolPref(PreferenceConstants.P_FUZZY_SEARCH_AUTO); } }