/* * Copyright (c) 2009 Andrejs Jermakovics. * * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * Andrejs Jermakovics - initial implementation */ package it.unibz.instasearch.ui; import it.unibz.instasearch.InstaSearch; import it.unibz.instasearch.InstaSearchPlugin; import it.unibz.instasearch.indexing.Field; import it.unibz.instasearch.indexing.SearchQuery; import it.unibz.instasearch.indexing.SearchResult; import it.unibz.instasearch.indexing.SearchResultDoc; import it.unibz.instasearch.indexing.Searcher; import it.unibz.instasearch.indexing.StorageIndexer; import it.unibz.instasearch.indexing.WorkspaceIndexer; import it.unibz.instasearch.prefs.PreferenceConstants; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import org.eclipse.core.resources.IFile; import org.eclipse.core.resources.IStorage; import org.eclipse.jface.viewers.ITreeContentProvider; import org.eclipse.jface.viewers.Viewer; import org.eclipse.search.ui.text.Match; import org.eclipse.ui.IEditorInput; class ResultContentProvider implements ITreeContentProvider { private static final String[] NO_INDEX_MESSAGE = new String[] {"Index is not built"}; private static final String[] NO_FILE_MESSAGE = new String[] {"File missing"}; private static final String NO_RESULTS_MESSAGE = "0 matches"; public static final double MAX_LINE_SIMILARITY = 0.9; // if above that, only one of the similar lines is kept public static final int MAX_LINES_TO_PROCESS = 5000; // max nr of lines to read from files (prevent slowdown on huge files) private WorkspaceIndexer indexer; private Searcher searcher; /** * Map of search terms to their boost score */ private SearchQuery currentSearchQuery; private Map<String, Float> searchTerms; private Object[] cachedResults; private SearchResultDoc cachedResultDoc; private Object[] cachedChildren; private int resultCount; public ResultContentProvider() { InstaSearch instaSearch = InstaSearchPlugin.getInstaSearch(); this.indexer = instaSearch.getIndexer(); this.searcher = instaSearch.getSearcher(); } public void inputChanged(Viewer v, Object oldInput, Object newSearch) { if( newSearch==null || !(newSearch instanceof SearchQuery) ) { currentSearchQuery = null; // clear cache cachedResults = null; } } public void dispose() { } public Object[] getElements(Object searchQueryObj) { if( searchQueryObj==null || !(searchQueryObj instanceof SearchQuery) ) return Collections.EMPTY_LIST.toArray(); // not searching SearchQuery searchQuery = (SearchQuery) searchQueryObj; String searchString = searchQuery.getSearchString(); if( searchString==null || searchString.toString().length() < Searcher.MIN_QUERY_LENGTH ) return Collections.EMPTY_LIST.toArray(); try { if( ! indexer.isIndexed() ) return NO_INDEX_MESSAGE; } catch(Exception e) { InstaSearchPlugin.log(e); return new Exception[]{e}; } if( searchQuery.equals(currentSearchQuery) && cachedResults != null ) // same query return cachedResults; SearchResult result = null; cachedResults = null; resultCount = 0; Object[] resultArray = null; if(searchString != null) { try { result = searcher.search(searchQuery); // do the search currentSearchQuery = searchQuery; if( result == null ) { if( !searchQuery.isFuzzy() ) { SearchQuery newQuery = new SearchQuery(searchQuery); newQuery.setExact(false); newQuery.setFuzzy(true); resultArray = new Object[]{NO_RESULTS_MESSAGE, newQuery}; // add fuzzy query } else { resultArray = new Object[]{NO_RESULTS_MESSAGE}; } cachedResults = resultArray; return resultArray; } searchTerms = result.getSearchTerms(); } catch (Exception e) { InstaSearchPlugin.log(e); return new Exception[]{e}; } } List<SearchResultDoc> resultDocs = result.getResultDocs(); this.resultCount = resultDocs.size(); boolean addMoreResults = false, addFindSimilar = false; if( searchQuery.isLimited() && result.isFull() ) { // if only showing limited number of matches addMoreResults = true; } else { if( searchQuery.isExact() && !searchQuery.isFuzzy() ) // if query is exact, can try search for individual tokens addFindSimilar = true; } if( addMoreResults || addFindSimilar ) resultArray = new Object[resultCount + 1]; // +1 because we append "More..." element (a SearchQuery object) else resultArray = new Object[resultCount]; resultDocs.toArray(resultArray); if( addMoreResults ) { // if more results, create More result entry (return all elements) SearchQuery moreResultsQuery = new SearchQuery(currentSearchQuery); moreResultsQuery.setMaxResults(SearchQuery.UNLIMITED_RESULTS); // all results resultArray[resultCount] = moreResultsQuery; } else if( addFindSimilar ) { SearchQuery findSimilarQuery = new SearchQuery(currentSearchQuery); findSimilarQuery.setExact(false); resultArray[resultCount] = findSimilarQuery; } cachedResults = resultArray; return resultArray; } /** * Count of search result docs from last search * @return result count * */ public int getResultCount() { return resultCount; } /** * Returns last search result elements. * Includes search result docs and additional action entries (eg More results) * * @return array of last search result elements */ public Object[] getElements() { return cachedResults; } public Object[] getChildren(Object parent) { if( parent instanceof SearchResultDoc ) { SearchResultDoc doc = (SearchResultDoc) parent; if( cachedResultDoc != null && doc.equals(cachedResultDoc) ) return cachedChildren; // cache results List<MatchLine> matches = null; Object[] children = null; try { matches = getMatchLines(doc, true, null); if( matches != null ) children = matches.toArray(); } catch (Exception e) { InstaSearchPlugin.log(e); } if( matches == null ) children = NO_FILE_MESSAGE; cachedResultDoc = doc; cachedChildren = children; return children; } else if( parent instanceof Exception ) { Exception e = (Exception) parent; return e.getStackTrace(); } return Collections.EMPTY_LIST.toArray(); } public interface MatchFindCallback { void matchFound(MatchLine line); boolean isCanceled(); } /** * Returns matched lines * @param doc * @param limit * @return * @throws Exception */ List<MatchLine> getMatchLines(SearchResultDoc doc, boolean limit, MatchFindCallback callback) throws Exception { if( searchTerms == null || currentSearchQuery == null ) return Collections.emptyList(); int maxMatches = InstaSearchPlugin.getIntPref(PreferenceConstants.P_SHOWN_LINES_COUNT); List<MatchLine> matchedLines = new ArrayList<MatchLine>(); int matchCount = doc.getMatchCount(); String searchString = currentSearchQuery.getSearchString().toLowerCase(Locale.ENGLISH); IStorage f = getStorage(doc); if( f == null ) { // index might be outdated (disabled updating) //TODO: remove file from index (update index) return null; } InputStream fileInputStream = null; if( f instanceof IFile ) { IFile file = (IFile) f; if( !file.exists() ) return null; fileInputStream = file.getContents(true); } else { fileInputStream = f.getContents(); } LineNumberReader lineReader = new LineNumberReader(new InputStreamReader(fileInputStream)); // is a buffered reader String line; // Read through file one line at a time while ( (line = lineReader.readLine()) != null ) { if( callback != null && callback.isCanceled() ) break; //if( currentSearchQuery.isCanceled() ) break; if( "".equals(line) ) continue; Map<String, List<Integer>> lineTerms = StorageIndexer.extractTextTerms(line); if( lineTerms.isEmpty() ) continue; HashSet<String> matchedTerms = new HashSet<String>(searchTerms.keySet()); // search terms that appear on this line matchedTerms.retainAll(lineTerms.keySet()); if( matchedTerms.isEmpty() && matchCount != 0 && limit ) // if have matches in general, but not on this line, then skip continue; float[] lineTermScoreVector = doc.getTermScoreVector(lineTerms.keySet()); float[] matchedTermScoreVector = doc.getTermScoreVector(matchedTerms); MatchLine matchLine = new MatchLine(doc, line, lineReader.getLineNumber(), matchedTerms, lineTermScoreVector, matchedTermScoreVector); matchedLines.add(matchLine); addMatches(matchLine, lineTerms, matchedTerms, searchString); if( callback != null ) callback.matchFound(matchLine); if( lineReader.getLineNumber() > MAX_LINES_TO_PROCESS ) break; //TODO: break if all current matches have high score (eg >0.9) } lineReader.close(); if(limit && matchedLines.size() > maxMatches) { matchedLines = getTopMatchLines(maxMatches, matchedLines); // return TOP N lines return matchedLines; } else return matchedLines; } private List<MatchLine> getTopMatchLines(int maxMatchLines, List<MatchLine> matchedLines) { Collections.sort(matchedLines); // sort by match count, score, line removeSimilarLines(matchedLines, maxMatchLines); matchedLines = matchedLines.subList(0, maxMatchLines); // top N results Collections.sort(matchedLines, new Comparator<MatchLine>() { // sort by line number for display public int compare(MatchLine l1, MatchLine l2) { return l1.getLineNumber() - l2.getLineNumber(); } }); return matchedLines; } /** * Find matches on the line * * @param matchLine * @param terms * @param matchedTerms * @param searchString * @return */ private float addMatches(MatchLine matchLine, Map<String, List<Integer>> terms, Set<String> matchedTerms, String searchString) { String lcaseLine = matchLine.getLine().toLowerCase(Locale.ENGLISH); if( !matchedTerms.contains(searchString) && !currentSearchQuery.isFuzzy() ) { // check for exact match on the line int pos = lcaseLine.indexOf(searchString); while( pos != -1 ) { Match m = new Match(searchString, pos, searchString.length()); matchLine.add(m, true); pos = lcaseLine.indexOf(searchString, pos + searchString.length() - 1); } } float matchedTermBoost = 0; for(String term: matchedTerms) { List<Integer> offsets = terms.get(term); for(int offset: offsets) { int pos = lcaseLine.indexOf(term, offset); if( pos == -1 ) continue; Match m = new Match(term, pos, term.length()); matchLine.add(m); } float boost = searchTerms.get(term); matchedTermBoost += boost; } matchLine.setMatchedTermBoost(matchedTermBoost); return matchedTermBoost; } /** * Removes similar lines from line matches. * Even if they are high scored, we don not want to see the same lines again * Line similarity is based on Cosine between their corresponding term vectors * * @param matchedLines * @param maxMatches */ private void removeSimilarLines(List<MatchLine> matchedLines, int maxMatches) { MatchLine curMatchLine = null; int lineNr = 0; for (Iterator<MatchLine> iterator = matchedLines.iterator(); iterator.hasNext() && matchedLines.size()>maxMatches; ) { MatchLine matchLine = iterator.next(); if( curMatchLine == null ) { curMatchLine = matchLine; lineNr++; continue; } double similarity = getLineSimilarity(curMatchLine, matchLine); if( similarity > MAX_LINE_SIMILARITY ) iterator.remove(); // since lines are sorted by score, lowest score line will be removed else { curMatchLine = matchLine; lineNr++; } if( lineNr == maxMatches ) break; } } /** * Calculates similarity based on the Cosine angle between score vectors of each line. * * @param lineMatches1 * @param lineMatches2 * @return */ private double getLineSimilarity(MatchLine lineMatches1, MatchLine lineMatches2) { float[] vect1 = lineMatches1.getScoreVector(); float[] vect2 = lineMatches2.getScoreVector(); double dotProduct = 0.0; double magnitude1 = 0.0; double magnitude2 = 0.0; for (int i = 0; i < vect1.length ; i++) { double val1 = vect1[i]; double val2 = vect2[i]; magnitude1 += val1 * val1; magnitude2 += val2 * val2; dotProduct += val1 * val2; } magnitude1 = Math.sqrt(magnitude1); magnitude2 = Math.sqrt(magnitude2); return (magnitude1 == 0 || magnitude2 == 0) ? 0 : dotProduct / (magnitude1 * magnitude2); } public Object getParent(Object element) { return null; } public boolean hasChildren(Object element) { return ( element instanceof SearchResultDoc ) || ( element instanceof SearchQuery ); } public Collection<String> getSearchTerms() { return searchTerms.keySet(); } public IEditorInput getEditorInput(SearchResultDoc doc) throws Exception { return indexer.getEditorInput(doc); } public IStorage getStorage(SearchResultDoc doc) throws Exception { return indexer.getStorage(doc); } public List<String> getProposals(String prefix, Field field) throws IOException { List<String> ucaseProposals = searcher.getProposals(prefix.toUpperCase(), field); if( prefix.toUpperCase().equals(prefix.toLowerCase(Locale.ENGLISH))) return ucaseProposals; List<String> lcaseProposals = searcher.getProposals(prefix.toLowerCase(Locale.ENGLISH), field); ucaseProposals.addAll(lcaseProposals); Collections.sort(ucaseProposals, String.CASE_INSENSITIVE_ORDER); return ucaseProposals; } /** * A class representing a line in a document and containing some keyword matches */ class MatchLine implements Comparable<MatchLine> { private String lineText; private List<Match> matches = new LinkedList<Match>(); private SearchResultDoc doc; private int lineNumber; private double termScore; private double matchedTermScore; private float[] scoreVector; private float matchedTermBoost; private int exactMatches; private int matchedTermCount; private MatchLine(SearchResultDoc doc, String lineText, int lineNumber, Set<String> matchedTerms, float[] termScoreVector, float[] matchedTermScoreVector) throws IOException { this.doc = doc; this.lineText = lineText; this.lineNumber = lineNumber; this.scoreVector = termScoreVector; termScore = getMagnitude(termScoreVector); matchedTermScore = getMagnitude(matchedTermScoreVector); matchedTermCount = matchedTerms.size(); } public void setMatchedTermBoost(float matchedTermBoost) { this.matchedTermBoost = matchedTermBoost; } public float getMatchedTermBoost() { return matchedTermBoost; } public void add(Match m) { matches.add(m); } public void add(Match m, boolean isExactMatch) { matches.add(m); if( isExactMatch ) this.exactMatches++; } public List<Match> getMatches() { return matches; } public String getLine() { return lineText; } public int getLineNumber() { return lineNumber; } public double getTermScore() { return termScore; } public float[] getScoreVector() { return scoreVector; } public double getMatchedTermScore() { return matchedTermScore; } public int compareTo(MatchLine lineMatches) { // to sort by match count and then by line number int diff = lineMatches.exactMatches - exactMatches; if( diff == 0 ) diff = lineMatches.matchedTermCount - matchedTermCount; if( diff == 0 ) Double.compare(lineMatches.matchedTermBoost, matchedTermBoost); if(diff == 0) diff = Double.compare(lineMatches.termScore, termScore); if(diff == 0) return getLineNumber() - lineMatches.lineNumber; // smaller to bigger return diff; } public SearchResultDoc getResultDoc() { return doc; } /** * Vector magnitude * * @param vect * @return */ private double getMagnitude(float[] vect) { double magnitude = 0; for(float value: vect) magnitude+=value*value; magnitude = Math.sqrt(magnitude); return magnitude; } @Override public String toString() { return "Line " + lineNumber + ": (" + matchedTermCount + ")" + lineText; } } }