Searcher.java example

Explorer

eclipse-instasearch-master
- instasearch
  - src
    - it
      - unibz
        instasearch
        InstaSearch.java
        InstaSearchPlugin.java
        InstaSearchStartup.java
        actions
        BuildIndexActionDelegate.java
        CheckUpdatesActionDelegate.java
        CollapseAllActionDelegate.java
        DeleteIndexActionDelegate.java
        ExpandAllActionDelegate.java
        IncrementalSearchActionDelegate.java
        InstaSearchActionDelegate.java
        OpenSearchDialog.java
        PreferencesAction.java
        ShowExceptionAction.java
        ShowInstaSearchAction.java
        VisitHomePageActionDelegate.java
        indexing
        Field.java
        FileAnalyzer.java
        LengthNormSimilarity.java
        QueryAnalyzer.java
        ResourceCollector.java
        SearchQuery.java
        SearchResult.java
        SearchResultDoc.java
        Searcher.java
        StorageIndexer.java
        WorkspaceIndexer.java
        WorkspaceIndexerJDT.java
        querying
        CSVExpander.java
        CurrentProjectSetter.java
        FieldAliasConverter.java
        FileNameSearcher.java
        FilterSetter.java
        FolderSearcher.java
        LastTermQueryPrefixer.java
        LowercaseConverter.java
        ModifiedTimeConverter.java
        PhraseSearcher.java
        QueryFuzzifier.java
        QueryVisitor.java
        UppercaseNameExpander.java
        VisitableQuery.java
        WorkingSetExpander.java
        tokenizers
        CamelCaseTokenizer.java
        DotSplitTokenizer.java
        TermSplitTokenizer.java
        WordSplitTokenizer.java
        standard
        CharStream.java
        FastCharStream.java
        ParseException.java
        StandardTokenizer.java
        StandardTokenizerConstants.java
        StandardTokenizerTokenManager.java
        Token.java
        TokenMgrError.java
        jobs
        CheckUpdatesJob.java
        DeleteIndexJob.java
        IndexUpdateJob.java
        IndexingJob.java
        UpdatePluginJob.java
        prefs
        InstaSearchPreferencePage.java
        PreferenceConstants.java
        PreferenceInitializer.java
        ui
        DropdownMenuProvider.java
        InstaSearchPage.java
        InstaSearchUI.java
        InstaSearchView.java
        MatchHighlightJob.java
        ReportErrorDialog.java
        ResultContentProvider.java
        ResultLabelProvider.java
        SearchContentProposalProvider.java
        SearchJob.java
        SearchViewControl.java
        StyledTextContentAdapter.java
  - test
    - it
      - unibz
        instasearch
        indexing
        SearcherTest.java
        TestSearcher.java
        TestStorage.java

/*
 * Copyright (c) 2009 Andrejs Jermakovics.
 * 
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Andrejs Jermakovics - initial implementation
 */
package it.unibz.instasearch.indexing;

import it.unibz.instasearch.indexing.StorageIndexer.IndexChangeListener;
import it.unibz.instasearch.indexing.querying.CSVExpander;
import it.unibz.instasearch.indexing.querying.CurrentProjectSetter;
import it.unibz.instasearch.indexing.querying.FieldAliasConverter;
import it.unibz.instasearch.indexing.querying.FileNameSearcher;
import it.unibz.instasearch.indexing.querying.FilterSetter;
import it.unibz.instasearch.indexing.querying.FolderSearcher;
import it.unibz.instasearch.indexing.querying.LastTermQueryPrefixer;
import it.unibz.instasearch.indexing.querying.LowercaseConverter;
import it.unibz.instasearch.indexing.querying.ModifiedTimeConverter;
import it.unibz.instasearch.indexing.querying.QueryFuzzifier;
import it.unibz.instasearch.indexing.querying.UppercaseNameExpander;
import it.unibz.instasearch.indexing.querying.VisitableQuery;
import it.unibz.instasearch.indexing.querying.WorkingSetExpander;
import it.unibz.instasearch.prefs.PreferenceConstants;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.QueryParser.Operator;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanQuery.TooManyClauses;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.QueryTermExtractor;
import org.apache.lucene.search.highlight.WeightedTerm;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import org.eclipse.jface.util.IPropertyChangeListener;
import org.eclipse.jface.util.PropertyChangeEvent;

/**
 * Searcher for searching the index using {@link SearchQuery}.
 * <p>
 * A search string is parsed into a Lucene query, passed through a chain of
 * query visitors (see {@link #rewriteQuery}) and executed against a lazily
 * opened {@link IndexSearcher}. The searcher is closed whenever the index
 * changes ({@link #onIndexReset()}, {@link #onIndexUpdate()}) and re-opened
 * on the next search.
 * <p>
 * NOTE(review): the cached index searcher and the stateful visitors
 * ({@code filterSetter}, {@code currentProjectSetter}) are plain instance
 * state without synchronization, so concurrent searches on one instance would
 * presumably interfere with each other - confirm that callers serialize access.
 */
@SuppressWarnings("deprecation")
public class Searcher implements IPropertyChangeListener, IndexChangeListener {

	/**  @see QueryParser#setPhraseSlop(int) 	*/
	private static final int DEFAULT_PHRASE_SLOP = 0;
	/** Minimum length of query in characters */
	public static final int MIN_QUERY_LENGTH = 2;
	/** Minimum number of characters to be considered a word in indexing */
	public static final int MIN_WORD_LENGTH = 1;
	/** Character that identifies the current project in search query */
	public static final String CURRENT_PROJECT_CHAR = ".";
	private static final Version LUCENE_VERSION = Version.LUCENE_29;
	/** Maximum number of boolean clauses a (rewritten) query may expand to */
	private static final int MAX_CLAUSE_COUNT = 5000;
	/** Regex matching the special query characters stripped after a parse failure */
	private static final String SPECIAL_QUERY_CHARS_REGEX = "[\\(\\)\"\\[\\]'\\{\\}]";
	
	/** Lazily opened searcher, see {@link #getIndexSearcher()}; null while closed */
	private IndexSearcher indexSearcher;
	
	public static final LengthNormSimilarity SIMILARITY = new LengthNormSimilarity();  //TODO: share with WorkspaceIndexer
	private static final QueryAnalyzer queryAnalyzer = new QueryAnalyzer(MIN_WORD_LENGTH);
	
	// Query visitors that modify the search query
	private final CSVExpander csvExpander = new CSVExpander();
	private final WorkingSetExpander workingSetExpander = new WorkingSetExpander();
	private final UppercaseNameExpander uppercaseNameExpander = new UppercaseNameExpander();
	private final FieldAliasConverter fieldAliasConverter = new FieldAliasConverter();
	private final ModifiedTimeConverter modifiedTimeConverter = new ModifiedTimeConverter();
	private final LowercaseConverter lowercaseConverter = new LowercaseConverter();
	private final FolderSearcher folderSearcher = new FolderSearcher();
	private final FileNameSearcher fileNameSearcher = new FileNameSearcher();
	private final CurrentProjectSetter currentProjectSetter = new CurrentProjectSetter();
	private final QueryFuzzifier queryFuzzifier = new QueryFuzzifier();
	private final LastTermQueryPrefixer lastTermQueryPrefixer = new LastTermQueryPrefixer(MIN_QUERY_LENGTH+1);
	private final FilterSetter filterSetter = new FilterSetter();

	/** Whether match counts are computed for every result document */
	private boolean showMatchCounts = true;
	/** Whether a fuzzy search is attempted automatically when nothing else matched */
	private boolean fuzzySearchAuto = true;
	private final SearcherConfig config;

	/**
	 * Environment the searcher depends on: preferences, logging and the index location
	 */
	public interface SearcherConfig
	{
		/**
		 * @param pref preference name
		 * @return value of the boolean preference
		 */
		boolean getBoolPref(String pref);
		
		/**
		 * @param e exception to report
		 */
		void log(Exception e);
		
		/**
		 * @return directory containing the index
		 * @throws IOException
		 */
		Directory getIndexDir() throws IOException;
	}
	
	/**
	 * Creates the searcher and reads its preferences through {@link #initPrefs()}
	 * 
	 * @param config provider of preferences, logging and the index directory
	 */
	public Searcher(SearcherConfig config) {
		this.config = config;
		initPrefs();
	}
	
	/**
	 * @return the index directory as supplied by the config
	 * @throws IOException
	 */
	protected Directory getIndexDir() throws IOException {
		return config.getIndexDir(); 
	}
	
	/**
	 * Parses the search query and runs it against the index.
	 * If the search string can not be parsed, special query characters are
	 * replaced by spaces (the search query is updated with the cleaned string)
	 * and parsing is attempted once more.
	 * 
	 * @param searchQuery
	 * @return the search result or null if nothing was found, the search was
	 *         canceled or the search string could not be parsed
	 * @throws Exception
	 */
	private SearchResult searchIndex(SearchQuery searchQuery) throws Exception {
		
		IndexSearcher searcher = getIndexSearcher();
		IndexReader reader = searcher.getIndexReader();
		boolean exact = searchQuery.isExact();
		
		Query query;
		
		try {
			query = parseWithPrefixFallback(searchQuery, reader, exact);
			
		} catch(ParseException e) {
			
			// remove special query characters
			String cleanedSearchString = searchQuery.getSearchString().replaceAll(SPECIAL_QUERY_CHARS_REGEX, " ");
			searchQuery.setSearchString(cleanedSearchString);
			
			try {
				// same clause-limit fallback as the first attempt, so the retry
				// can not fail with TooManyClauses where the first attempt would not
				query = parseWithPrefixFallback(searchQuery, reader, exact);
				
			} catch(ParseException ignored) {
				// can have error while typing query, just ignore
				return null; 
			}
		}
		
		return collectSearchResults(searchQuery, searcher, reader, query);
	}

	/**
	 * Parses the search query with last-term prefixing enabled and falls back
	 * to parsing without it when the query exceeds the clause limit.
	 * 
	 * @param searchQuery
	 * @param reader
	 * @param exact
	 * @return the parsed and rewritten query
	 * @throws ParseException if the search string can not be parsed
	 * @throws IOException
	 */
	private Query parseWithPrefixFallback(SearchQuery searchQuery, IndexReader reader, boolean exact)
			throws ParseException, IOException
	{
		try {
			return parseSearchQuery(searchQuery, reader, exact, true);
			
		} catch(TooManyClauses e) { // too many, try without prefix search
			return parseSearchQuery(searchQuery, reader, exact, false);
		}
	}

	/**
	 * Executes the query and converts the hits into result documents.
	 * Collecting stops early once the search query is canceled; the documents
	 * gathered up to that point are still returned.
	 * 
	 * @param searchQuery
	 * @param indexSearcher
	 * @param reader
	 * @param query
	 * @return the search result or null if canceled before searching or if there are no hits
	 * @throws IOException
	 */
	private SearchResult collectSearchResults(SearchQuery searchQuery,
			IndexSearcher indexSearcher, IndexReader reader, Query query)
			throws IOException
	{
		int maxResults = searchQuery.isLimited()
				? searchQuery.getMaxResults()
				: reader.numDocs(); // all documents
		
		Map<String, Float> searchTerms = extractTerms(query);
		TopDocCollector collector = new TopDocCollector( maxResults );
		
		if( searchQuery.isCanceled() ) 
			return null;
		
		indexSearcher.search(query, collector); // do the actual search
		
		if( collector.getTotalHits() == 0 )
			return null;
		
		ScoreDoc[] hits = collector.topDocs().scoreDocs;
		List<SearchResultDoc> resultDocs = new ArrayList<SearchResultDoc>(hits.length);
		
		for (int i = 0; i < hits.length && !searchQuery.isCanceled(); i++) 
		{
			ScoreDoc hit = hits[i];
			Document doc = reader.document(hit.doc);
			
			// the index directory is requested per document on purpose:
			// what the config hands out (shared or fresh instance) is not visible here
			SearchResultDoc resultDoc = new SearchResultDoc(getIndexDir(), doc, hit.doc, hit.score);
			
			if( showMatchCounts )
				resultDoc.computeMatchCount(reader, searchTerms.keySet());
			
			resultDocs.add(resultDoc);
		}
		
		return new SearchResult(searchQuery, resultDocs, searchTerms);
	}

	/**
	 * Performs a search using the search query.
	 * <p>
	 * A fuzzy query is searched as is. Otherwise up to three attempts are made,
	 * each one only if the previous found nothing and the query was not canceled:
	 * exact text (if requested), tokenized (non-exact) text and finally - if
	 * automatic fuzzy search is enabled - a fuzzy search.
	 * The exact/fuzzy flags of the passed search query are updated accordingly.
	 * 
	 * @param searchQuery
	 * 
	 * @return SearchResult or null if nothing is found
	 * @throws Exception
	 */
	public SearchResult search(SearchQuery searchQuery) throws Exception {
		
		String searchString = searchQuery.getSearchString();
		
		if( searchString == null || searchString.length() < MIN_QUERY_LENGTH ) 
			return null;
		
		if( searchQuery.isFuzzy() )
		{
			searchQuery.setExact(false);
			return searchIndex(searchQuery); // search fuzzy
		}
		
		SearchResult result = null;
		
		// 1. search exact text
		if( searchQuery.isExact() )
			result = searchIndex(searchQuery);
		
		// 2. split search text into tokens and search (non-exact)
		if( isNothingFound(result) && !searchQuery.isCanceled() )
		{
			searchQuery.setExact(false);
			result = searchIndex(searchQuery);
		}
		
		// 3. search wildcarded and fuzzy matches
		if( isNothingFound(result) && fuzzySearchAuto && !searchQuery.isCanceled() ) // if fuzzy search enabled
		{
			searchQuery.setFuzzy(true);
			searchQuery.setExact(false);
			
			result = searchIndex(searchQuery);
		}
		
		return result;
	}

	/**
	 * @param result
	 * @return true if the result is missing or contains nothing
	 */
	private static boolean isNothingFound(SearchResult result)
	{
		return result == null || result.isEmpty();
	}

	/**
	 * Opens the index searcher on first use and re-uses it afterwards
	 * 
	 * @return the indexSearcher
	 * @throws IOException 
	 */
	private IndexSearcher getIndexSearcher() throws IOException
	{
		if( indexSearcher == null ) 
		{
			indexSearcher = new IndexSearcher( getIndexDir(), true );
			indexSearcher.setSimilarity( SIMILARITY );
		}
		
		return indexSearcher;
	}
		
	/**
	 * Get all terms that start with prefixText.
	 * The prefix is compared ignoring case; enumeration starts at the term
	 * created from prefixText and stops at the first term that does not match.
	 * 
	 * @param prefixText
	 * @param prefixField
	 * 
	 * @return proposed terms
	 * @throws IOException
	 */
	public List<String> getProposals(String prefixText, Field prefixField) throws IOException 
	{
		List<String> proposals = new ArrayList<String>();
		IndexReader reader = getIndexSearcher().getIndexReader();
		Term prefix = prefixField.createTerm(prefixText);
		
		// everything that may throw is computed before the enumerator is opened,
		// so the enumerator is always closed by the finally block below
		String lowerCasePrefix = prefixText.toLowerCase(Locale.ENGLISH);
		String fieldName = prefixField.toString();
		
		TermEnum enumerator = reader.terms(prefix);
		
		try {
			do {
				Term term = enumerator.term();
				
				boolean matches = term != null
						&& term.text().toLowerCase(Locale.ENGLISH).startsWith(lowerCasePrefix)
						&& term.field().equalsIgnoreCase(fieldName);
				
				if( !matches )
					break;
				
				proposals.add(term.text());
				
			} while (enumerator.next());
		} finally {
			enumerator.close();
		}
		
		return proposals;
	}

	/**
	 * Get names of all projects that are in the index
	 * 
	 * @return list of indexed projects
	 * @throws IOException
	 */
	public List<String> getIndexProjects() throws IOException
	{
		return getProposals("", Field.PROJ);
	}
	
	/**
	 * Need to reset searcher when index has been updated
	 * Otherwise changes won't appear in the results
	 * It is recommended to do a warmup search after the searcher was reset
	 * 
	 */
	public void onIndexReset() {
		resetSearcher();
	}
	
	/**
	 * Closes the current index searcher (if any) so that the next search opens a new one.
	 * A failure to close is logged and the searcher is dropped nevertheless.
	 */
	private void resetSearcher()
	{
		if( indexSearcher == null )
			return;
		
		try {
			indexSearcher.close();
		} catch (IOException e) {
			config.log(e);
		} finally {
			indexSearcher = null;
		}
	}
	
	/**
	 * Warmup search. Searches after this will be faster
	 * 
	 * @throws Exception
	 */
	private void warmup() throws Exception {
		search(new SearchQuery("<warmup search>", 1));
	}

	/**
	 * Closes and re-opens the index for searching
	 * Should be called after the index is updated.
	 * Errors of the warmup search are logged, not propagated.
	 * 
	 */
	public void onIndexUpdate() {
		
		resetSearcher();
		
		try {
			warmup();
		} catch (Exception e) {
			config.log(e);
		}
	}

	/**
	 * Builds the final Lucene query for a search query.
	 * For an exact search only the exact query is used; otherwise the tokenized
	 * query is combined with the exact query, which gets twice the boost.
	 * The result is passed through the query visitors and Lucene's own rewrite.
	 * 
	 * @param searchQuery
	 * @param reader reader used for Lucene's rewrite
	 * @param exact whether only the exact query should be used
	 * @param prefix whether the last term may be turned into a prefix query
	 * @return the rewritten query
	 * @throws ParseException if the search string can not be parsed
	 * @throws IOException
	 */
	private Query parseSearchQuery(SearchQuery searchQuery, IndexReader reader, boolean exact, boolean prefix) throws ParseException, IOException
	{
		BooleanQuery.setMaxClauseCount(MAX_CLAUSE_COUNT); // so we don't get TooManyClauses exceptions
		
		Query exactQuery = createExactQuery(searchQuery);
		Query returnQuery;
		
		if( exact )
		{
			returnQuery = exactQuery;
		}
		else
		{
			Query tokenizedQuery = parseSearchString(searchQuery.getSearchString(), queryAnalyzer);
			exactQuery.setBoost(tokenizedQuery.getBoost() * 2f); // exact query more important
			returnQuery = combineQueries(tokenizedQuery, exactQuery);
		}
		
		returnQuery = rewriteQuery(searchQuery, prefix, returnQuery);
		
		return returnQuery.rewrite(reader); // lucene's rewrite (ie expand prefix queries)
	}

	/**
	 * Creates the query that matches the search string as exactly as possible:
	 * a phrase query if the string contains spaces, a single-term query otherwise
	 * 
	 * @param searchQuery
	 * @return the exact query after uppercase-name and file-name expansion
	 * @throws ParseException if the search string can not be parsed
	 */
	private Query createExactQuery(SearchQuery searchQuery) throws ParseException
	{
		String searchString = searchQuery.getSearchString();
		Query query;
		
		if( searchString.contains(" ") )
		{
			Query parsedQuery = parseSearchString(searchString, new StandardAnalyzer(LUCENE_VERSION));
			query = convertToPhraseQuery(parsedQuery);
		}
		else
		{
			query = parseSearchString(searchString, new KeywordAnalyzer()); // searchstring is one term
		}
		
		VisitableQuery visitableQuery = new VisitableQuery(query);
		
		visitableQuery.accept(uppercaseNameExpander);
		visitableQuery.accept(fileNameSearcher);
		
		return visitableQuery.getQuery();
	}

	/**
	 * Converts a query into a phrase query made of its terms (in extraction order).
	 * The query is returned unchanged if it has no extractable terms, if term
	 * extraction is not supported or if any term belongs to a field other than
	 * the contents field.
	 * 
	 * @param query
	 * @return the phrase query or the original query
	 */
	private static Query convertToPhraseQuery(Query query)
	{
		Set<Term> terms = new LinkedHashSet<Term>();
		
		try
		{
			query.extractTerms(terms);
		}
		catch(UnsupportedOperationException ignored) // not all queries support it 
		{
			return query;
		}
		
		if( terms.isEmpty() ) // nothing to build a phrase from, keep what was parsed
			return query;
		
		PhraseQuery phraseQuery = new PhraseQuery();
		
		for(Term term: terms)
		{
			if( Field.CONTENTS != Field.fromTerm(term) )
				return query;
			
			phraseQuery.add(term);
		}
		
		return phraseQuery;
	}

	/**
	 * Applies the query visitors to the query.
	 * The visitors are applied in a fixed sequence which must be kept as is,
	 * since each one sees the query as modified by the previous ones.
	 * 
	 * @param searchQuery provides the fuzzy flag, filter and current project
	 * @param prefix whether the last term may be turned into a prefix query
	 * @param query the query to modify
	 * @return the modified query
	 */
	private Query rewriteQuery(SearchQuery searchQuery, boolean prefix, Query query) 
	{
		boolean fuzzy = searchQuery.isFuzzy();
		VisitableQuery visitableQuery = new VisitableQuery(query);
		
		visitableQuery.accept(uppercaseNameExpander);
		visitableQuery.accept(lowercaseConverter);
		
		if( fuzzy )
			visitableQuery.accept(queryFuzzifier);
		
		visitableQuery.accept(csvExpander);
		visitableQuery.accept(fieldAliasConverter);
		visitableQuery.accept(modifiedTimeConverter);
		visitableQuery.accept(folderSearcher);
		
		if( prefix && !fuzzy ) // prefix last term query for substring search while typing
			visitableQuery.accept(lastTermQueryPrefixer);
		
		if( searchQuery.getFilter() != null )
		{
			filterSetter.setFilter( searchQuery.getFilter() );
			visitableQuery.accept(filterSetter);
		}
		
		visitableQuery.accept(workingSetExpander);
		
		if( searchQuery.getCurrentProject() != null )
		{
			currentProjectSetter.setCurrentProject(searchQuery.getCurrentProject());
			visitableQuery.accept(currentProjectSetter);
		}
		
		return visitableQuery.getQuery();  // the modified query after all visitors
	}

	/**
	 * Combines both queries as optional clauses, so that a document matching either one is found
	 * 
	 * @param query
	 * @param exactQuery
	 * @return boolean query containing both queries
	 */
	private BooleanQuery combineQueries(Query query, Query exactQuery)
	{
		BooleanQuery topQuery = new BooleanQuery();
		topQuery.add(exactQuery, Occur.SHOULD);
		topQuery.add(query, Occur.SHOULD);
		return topQuery;
	}

	/**
	 * Parses the search string with Lucene's query parser using the contents
	 * field as default field and AND as default operator
	 * 
	 * @param searchString
	 * @param analyzer analyzer used for the query terms
	 * @return the parsed query
	 * @throws ParseException if the search string is not a valid query
	 */
	private Query parseSearchString(String searchString, Analyzer analyzer) throws ParseException 
	{
		QueryParser queryParser = new QueryParser(LUCENE_VERSION, Field.CONTENTS.toString(), analyzer);
		queryParser.setDefaultOperator(Operator.AND); // all fields required
		queryParser.setLowercaseExpandedTerms(false);
		queryParser.setPhraseSlop(DEFAULT_PHRASE_SLOP);

		/*
		 * Allow words in the query to begin with *
		 * see http://lucene.apache.org/java/2_3_1/api/org/apache/lucene/queryParser/QueryParser.html#setAllowLeadingWildcard%28boolean%29
		 */
		queryParser.setAllowLeadingWildcard(true);

		return queryParser.parse(searchString);
	}
	
	/**
	 * Extracts search terms of the contents field and their weights
	 * 
	 * @param query
	 * @return map from term text to term weight
	 */
	private static Map<String, Float> extractTerms(Query query)
	{
		WeightedTerm[] weightedTerms = QueryTermExtractor.getTerms(query, false, Field.CONTENTS.toString());
		
		Map<String,Float> terms = new HashMap<String,Float>(weightedTerms.length);
		
		for(WeightedTerm weightedTerm: weightedTerms)
			terms.put( weightedTerm.getTerm(), weightedTerm.getWeight() );
		
		return terms;
	}

	/**
	 * Reads the preferences used by the searcher. Called from the constructor.
	 */
	protected void initPrefs() 
	{
		fuzzySearchAuto = config.getBoolPref(PreferenceConstants.P_FUZZY_SEARCH_AUTO);
		showMatchCounts = config.getBoolPref(PreferenceConstants.P_SHOW_MATCH_COUNT);
	}

	/**
	 * Re-reads a preference used by the searcher when it changes
	 * 
	 * @param event
	 */
	@Override
	public void propertyChange(PropertyChangeEvent event) {
		String prop = event.getProperty();
		
		if( PreferenceConstants.P_SHOW_MATCH_COUNT.equals(prop) )
			showMatchCounts = config.getBoolPref(PreferenceConstants.P_SHOW_MATCH_COUNT);
		else if( PreferenceConstants.P_FUZZY_SEARCH_AUTO.equals(prop) )
			fuzzySearchAuto = config.getBoolPref(PreferenceConstants.P_FUZZY_SEARCH_AUTO);
	}

}