/**
 * FreeDesktopSearch - A Search Engine for your Desktop
 * Copyright (C) 2013 Mirko Sertic
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
package de.mirkosertic.desktopsearch;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenGroup;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Produces search phrase suggestions by running the entered phrase as a span query against the index
 * and returning the best matching text fragment of every hit.
 */
class SearchPhraseSuggester {

    private static final Logger LOGGER = Logger.getLogger(SearchPhraseSuggester.class);

    /** Characters at which an entered phrase is split into single tokens. */
    private static final String TOKEN_SEPARATORS = " ,:;?!.";

    /** Prefix marking a token that is excluded from the suggestion query. */
    private static final String NEGATION_PREFIX = "-";

    /** Highlighter formatter that returns every term unchanged; markup is added later by {@link #highlight}. */
    private static final Formatter PLAIN_TEXT_FORMATTER = (String aSpan, TokenGroup aTokenGroup) -> aSpan;

    private final IndexSearcher indexSearcher;
    private final Analyzer analyzer;
    private final Configuration configuration;

    /**
     * @param aIndexSearcher the searcher used to execute the suggestion query and to load the hit documents
     * @param aAnalyzer the analyzer used to normalize entered tokens and to fragment the stored content
     * @param aConfiguration supplies slop, ordering and the maximum number of suggestions
     */
    public SearchPhraseSuggester(IndexSearcher aIndexSearcher, Analyzer aAnalyzer, Configuration aConfiguration) {
        indexSearcher = aIndexSearcher;
        analyzer = aAnalyzer;
        configuration = aConfiguration;
    }

    /**
     * Computes suggestions for a partially entered search phrase.
     *
     * @param aFieldName the index field that is searched and whose stored value is used as suggestion source
     * @param aPhrase the phrase entered so far; may be {@code null} or empty
     * @return the suggestions, one per usable fragment; empty if the phrase contains no usable token
     * @throws IOException if analyzing the phrase, rewriting a wildcard token or searching the index fails
     */
    public List<Suggestion> suggestSearchPhrase(String aFieldName, String aPhrase) throws IOException {

        LOGGER.info("Trying to find suggestions for phrase " + aPhrase);

        List<String> theTokens = toTokens(aFieldName, aPhrase);
        if (theTokens.isEmpty()) {
            // Nothing to search for (empty phrase, only separators or only negated tokens)
            return new ArrayList<>();
        }

        List<SpanQuery> theSpanQueries = new ArrayList<>(theTokens.size());
        for (String theToken : theTokens) {
            theSpanQueries.add(toSpanQuery(aFieldName, theToken));
        }

        Query theQuery;
        if (theSpanQueries.size() > 1) {
            theQuery = new SpanNearQuery(theSpanQueries.toArray(new SpanQuery[theSpanQueries.size()]),
                    configuration.getSuggestionSlop(), configuration.isSuggestionInOrder());
        } else {
            theQuery = theSpanQueries.get(0);
        }

        LOGGER.info("created span query " + theQuery);

        // The text a suggestion should start with. Wildcard characters never occur literally in the
        // content, so only the literal part in front of the first wildcard can be located.
        String theAnchor = literalPrefix(theTokens.get(0).toLowerCase());

        List<Suggestion> theResult = new ArrayList<>();
        Highlighter theHighlighter = new Highlighter(PLAIN_TEXT_FORMATTER, new QueryScorer(theQuery));

        TopDocs theDocs = indexSearcher.search(theQuery, configuration.getNumberOfSuggestions(), Sort.RELEVANCE);
        for (int i = 0; i < theDocs.scoreDocs.length; i++) {
            Document theDocument = indexSearcher.getIndexReader().document(theDocs.scoreDocs[i].doc);
            String theOriginalContent = theDocument.get(aFieldName);
            if (theOriginalContent == null) {
                // Hit without a stored string value for the field, nothing to build a suggestion from
                continue;
            }

            try {
                for (String theFragment : theHighlighter.getBestFragments(analyzer, aFieldName, theOriginalContent, 1)) {
                    // Locate the first token and cut off everything in front of it
                    int p = theFragment.toLowerCase().indexOf(theAnchor);
                    if (p > 0) {
                        theFragment = theFragment.substring(p).trim();
                    }
                    theResult.add(new Suggestion(highlight(theFragment, theTokens), theFragment));
                }
            } catch (Exception e) {
                // Best effort: a single broken hit must not prevent the remaining suggestions
                LOGGER.error("Error creating suggestion for phrase " + aPhrase, e);
            }
        }

        return theResult;
    }

    /**
     * Creates the span query for a single token. Wildcard tokens are wrapped as span multi term query
     * and rewritten against the current index reader, all other tokens become a plain span term query.
     */
    private SpanQuery toSpanQuery(String aFieldName, String aToken) throws IOException {
        if (!QueryUtils.isWildCard(aToken)) {
            return new SpanTermQuery(new Term(aFieldName, aToken));
        }
        WildcardQuery theWildcardQuery = new WildcardQuery(new Term(aFieldName, aToken));
        SpanMultiTermQueryWrapper<WildcardQuery> theWrapper = new SpanMultiTermQueryWrapper<>(theWildcardQuery);
        return theWrapper.getRewriteMethod().rewrite(indexSearcher.getIndexReader(), theWildcardQuery);
    }

    /**
     * Wraps every occurrence of one of the tokens inside the phrase into {@code <b>...</b>}.
     * Matching is case insensitive and done in a single pass, so markup inserted for one token is
     * never matched again by another token.
     *
     * @param aPhrase the text to decorate
     * @param aTokens the tokens to highlight; wildcard tokens are matched by their literal parts
     * @return the phrase with highlighting markup, or the unchanged phrase if no token is usable
     */
    private String highlight(String aPhrase, List<String> aTokens) {
        List<String> theAlternatives = aTokens.stream()
                .map(String::toLowerCase)
                .distinct()
                // Longest token first, so the longer of two overlapping tokens wins in the alternation
                .sorted((aFirst, aSecond) -> Integer.compare(aSecond.length(), aFirst.length()))
                .map(SearchPhraseSuggester::toHighlightRegex)
                .filter(theRegex -> !theRegex.isEmpty())
                .distinct()
                .collect(Collectors.toList());
        if (theAlternatives.isEmpty()) {
            return aPhrase;
        }

        Pattern thePattern = Pattern.compile(String.join("|", theAlternatives), Pattern.CASE_INSENSITIVE);
        Matcher theMatcher = thePattern.matcher(aPhrase);
        StringBuffer theResult = new StringBuffer();
        while (theMatcher.find()) {
            theMatcher.appendReplacement(theResult, Matcher.quoteReplacement("<b>" + theMatcher.group() + "</b>"));
        }
        theMatcher.appendTail(theResult);
        return theResult.toString();
    }

    /**
     * Translates a token into a regular expression that can never match the empty string.
     * Literal text is quoted, a single character wildcard matches one non whitespace character and a
     * multi character wildcard between literals matches the shortest run of non whitespace characters.
     * Leading and trailing multi character wildcards are dropped, so only the typed text is highlighted.
     *
     * @return the regular expression, or an empty string if the token consists of wildcards only
     */
    private static String toHighlightRegex(String aToken) {
        String theCore = StringUtils.strip(aToken, String.valueOf(WildcardQuery.WILDCARD_STRING));
        StringBuilder theRegex = new StringBuilder();
        StringBuilder theLiteral = new StringBuilder();
        for (int i = 0; i < theCore.length(); i++) {
            char theChar = theCore.charAt(i);
            if (!isWildcardChar(theChar)) {
                theLiteral.append(theChar);
                continue;
            }
            if (theLiteral.length() > 0) {
                theRegex.append(Pattern.quote(theLiteral.toString()));
                theLiteral.setLength(0);
            }
            theRegex.append(theChar == WildcardQuery.WILDCARD_STRING ? "\\S*?" : "\\S");
        }
        if (theLiteral.length() > 0) {
            theRegex.append(Pattern.quote(theLiteral.toString()));
        }
        return theRegex.toString();
    }

    /** Returns the part of the token in front of its first wildcard character. */
    private static String literalPrefix(String aToken) {
        int theEnd = 0;
        while (theEnd < aToken.length() && !isWildcardChar(aToken.charAt(theEnd))) {
            theEnd++;
        }
        return aToken.substring(0, theEnd);
    }

    /** Tells whether the character is one of the wildcard characters of a Lucene wildcard query. */
    private static boolean isWildcardChar(char aChar) {
        return aChar == WildcardQuery.WILDCARD_STRING || aChar == WildcardQuery.WILDCARD_CHAR;
    }

    /**
     * Runs the string through the analyzer and returns the first term it produces.
     *
     * @return the first analyzed term, or {@code null} if the analyzer emits no term at all
     * @throws IOException if the token stream cannot be processed
     */
    private String analyze(String aFieldName, String aString) throws IOException {
        // try-with-resources closes the stream even if reset() or incrementToken() fails
        try (TokenStream theTokenStream = analyzer.tokenStream(aFieldName, aString)) {
            CharTermAttribute theCharTerms = theTokenStream.getAttribute(CharTermAttribute.class);
            theTokenStream.reset();
            String theFirstTerm = theTokenStream.incrementToken() ? theCharTerms.toString() : null;
            theTokenStream.end();
            return theFirstTerm;
        }
    }

    /**
     * Splits the phrase into the tokens used for the suggestion query.
     * The last token is turned into a wildcard token if it is longer than two characters, tokens
     * starting with {@code -} are dropped, wildcard tokens are kept as entered and every other token
     * is replaced by its analyzed form.
     *
     * @param aFieldName the field whose analysis chain is applied
     * @param aPhrase the entered phrase; may be {@code null}
     * @return the tokens in entered order, possibly empty
     * @throws IOException if analyzing a token fails
     */
    private List<String> toTokens(String aFieldName, String aPhrase) throws IOException {
        List<String> theTokens = new ArrayList<>();

        String[] theSplitTokens = StringUtils.split(aPhrase, TOKEN_SEPARATORS);
        if (theSplitTokens == null) {
            // StringUtils.split returns null for a null phrase
            return theTokens;
        }

        for (int i = 0; i < theSplitTokens.length; i++) {
            String theToken = theSplitTokens[i];

            // Mutate the last token to a wildcard
            boolean isLastToken = i == theSplitTokens.length - 1;
            if (isLastToken && theToken.length() > 2 && !QueryUtils.isWildCard(theToken)) {
                theToken = theToken + QueryUtils.ASTERISK;
            }

            if (theToken.startsWith(NEGATION_PREFIX)) {
                continue;
            }

            if (QueryUtils.isWildCard(theToken)) {
                theTokens.add(theToken);
            } else {
                String theAnalyzed = analyze(aFieldName, theToken);
                if (theAnalyzed != null) {
                    theTokens.add(theAnalyzed);
                }
            }
        }

        return theTokens;
    }
}