SearchPhraseSuggester.java example

Explorer
FXDesktopSearch-master
- src
  - main
    - java
  - test
    - java
      - de
        - mirkosertic
          - desktopsearch
            - QueryParserTest.java
            - QueryTokenizerTest.java
/**
 * FreeDesktopSearch - A Search Engine for your Desktop
 * Copyright (C) 2013 Mirko Sertic
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
package de.mirkosertic.desktopsearch;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenGroup;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Suggests search phrases for a partially typed phrase.
 *
 * <p>The typed phrase is split into tokens, the tokens are combined into a span query and the
 * best matching text fragment of every hit is returned as a {@link Suggestion}, once with the
 * typed tokens wrapped in {@code <b>} tags and once as plain text.
 */
class SearchPhraseSuggester {

    private static final Logger LOGGER = Logger.getLogger(SearchPhraseSuggester.class);

    /** Characters at which a typed phrase is split into tokens. */
    private static final String TOKEN_SEPARATORS = " ,:;?!.";

    /** Regular expression matching a single letter or digit. */
    private static final String WORD_CHARACTER = "[\\p{L}\\p{N}]";

    /** Tokens are located in fragments regardless of case, for non-ASCII letters as well. */
    private static final int MATCH_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /** Leaves the fragment text untouched; the markup is added afterwards by {@code highlight}. */
    private static final Formatter UNCHANGED = (aSpan, aTokenGroup) -> aSpan;

    private final IndexSearcher indexSearcher;
    private final Analyzer analyzer;
    private final Configuration configuration;

    public SearchPhraseSuggester(IndexSearcher aIndexSearcher, Analyzer aAnalyzer, Configuration aConfiguration) {
        indexSearcher = aIndexSearcher;
        analyzer = aAnalyzer;
        configuration = aConfiguration;
    }

    /**
     * Computes suggestions for a typed phrase.
     *
     * @param aFieldName the index field to search and to take the fragments from
     * @param aPhrase the phrase typed so far, may be {@code null} or empty
     * @return the suggestions in the order of the search hits; empty if the phrase contains no
     *         usable token
     * @throws IOException if the index cannot be read or the phrase cannot be analyzed
     */
    public List<Suggestion> suggestSearchPhrase(String aFieldName, String aPhrase) throws IOException {

        LOGGER.info("Trying to find suggestions for phrase " + aPhrase);

        List<Suggestion> theResult = new ArrayList<>();

        List<String> theTokens = toTokens(aFieldName, aPhrase);
        if (theTokens.isEmpty()) {
            // Nothing to search for, e.g. an empty phrase or a phrase made of separators only
            return theResult;
        }

        Query theQuery = createQuery(aFieldName, theTokens);

        LOGGER.info("created span query " + theQuery);

        Highlighter theHighlighter = new Highlighter(UNCHANGED, new QueryScorer(theQuery));
        List<Pattern> theFirstTokenPatterns = firstTokenPatterns(theTokens.get(0));

        TopDocs theDocs = indexSearcher.search(theQuery, configuration.getNumberOfSuggestions(), Sort.RELEVANCE);
        for (int i = 0; i < theDocs.scoreDocs.length; i++) {
            int theDocumentId = theDocs.scoreDocs[i].doc;
            Document theDocument = indexSearcher.getIndexReader().document(theDocumentId);
            String theOriginalContent = theDocument.get(aFieldName);
            if (theOriginalContent == null) {
                // The document has no string value for the field, so there is no text to suggest
                continue;
            }

            try {
                for (String theFragment : theHighlighter.getBestFragments(analyzer, aFieldName, theOriginalContent, 1)) {
                    // Let the suggestion start at the first typed token
                    int theStart = indexOfFirstMatch(theFragment, theFirstTokenPatterns);
                    if (theStart > 0) {
                        theFragment = theFragment.substring(theStart).trim();
                    }

                    theResult.add(new Suggestion(highlight(theFragment, theTokens), theFragment));
                }
            } catch (Exception e) {
                // Best effort: a document that cannot be highlighted must not spoil the other suggestions
                LOGGER.error("Could not create suggestion from document " + theDocumentId, e);
            }
        }

        return theResult;
    }

    /**
     * Builds the query for the tokens: the span query of a single token, or a span near query
     * using the configured slop and order for several tokens.
     */
    private Query createQuery(String aFieldName, List<String> aTokens) throws IOException {
        List<SpanQuery> theSpanQueries = new ArrayList<>(aTokens.size());
        for (String theToken : aTokens) {
            theSpanQueries.add(toSpanQuery(aFieldName, theToken));
        }
        if (theSpanQueries.size() == 1) {
            return theSpanQueries.get(0);
        }
        return new SpanNearQuery(theSpanQueries.toArray(new SpanQuery[0]), configuration.getSuggestionSlop(), configuration.isSuggestionInOrder());
    }

    /**
     * Creates the span query for one token. Wildcard tokens are rewritten against the index
     * reader, all other tokens become a plain term query.
     */
    private SpanQuery toSpanQuery(String aFieldName, String aToken) throws IOException {
        Term theTerm = new Term(aFieldName, aToken);
        if (!QueryUtils.isWildCard(aToken)) {
            return new SpanTermQuery(theTerm);
        }
        WildcardQuery theWildcardQuery = new WildcardQuery(theTerm);
        SpanMultiTermQueryWrapper<WildcardQuery> theWrapper = new SpanMultiTermQueryWrapper<>(theWildcardQuery);
        return theWrapper.getRewriteMethod().rewrite(indexSearcher.getIndexReader(), theWildcardQuery);
    }

    /**
     * Wraps every occurrence of a token in the phrase in {@code <b>} tags.
     *
     * <p>All tokens are applied in a single pass over the original phrase, so markup that was
     * already inserted is never matched again and overlapping tokens cannot produce nested tags.
     */
    private String highlight(String aPhrase, List<String> aTokens) {
        String theAlternatives = aTokens.stream()
                .distinct()
                // Longer tokens first, the alternation takes the first alternative that matches
                .sorted((aFirst, aSecond) -> Integer.compare(aSecond.length(), aFirst.length()))
                .map(SearchPhraseSuggester::toRegex)
                .filter(theRegex -> !theRegex.isEmpty())
                .collect(Collectors.joining("|"));
        if (theAlternatives.isEmpty()) {
            return aPhrase;
        }

        Matcher theMatcher = Pattern.compile(theAlternatives, MATCH_FLAGS).matcher(aPhrase);
        StringBuilder theResult = new StringBuilder(aPhrase.length() + 16);
        int theLastEnd = 0;
        while (theMatcher.find()) {
            if (theMatcher.end() == theMatcher.start()) {
                continue;
            }
            theResult.append(aPhrase, theLastEnd, theMatcher.start());
            theResult.append("<b>").append(theMatcher.group()).append("</b>");
            theLastEnd = theMatcher.end();
        }
        theResult.append(aPhrase, theLastEnd, aPhrase.length());
        return theResult.toString();
    }

    /**
     * Translates a token into a regular expression that locates the typed text in a fragment.
     *
     * <p>Literal text is quoted, so regex meta characters in the token are harmless. The wildcard
     * characters of {@link WildcardQuery} are translated: {@code *} becomes a reluctant run of
     * letters and digits (a trailing {@code *} therefore matches nothing and only the typed
     * prefix is found), {@code ?} becomes exactly one letter or digit.
     *
     * @return the regular expression, or an empty string if the token contains no literal text
     */
    private static String toRegex(String aToken) {
        StringBuilder theRegex = new StringBuilder();
        StringBuilder theLiteral = new StringBuilder();
        boolean theHasLiteral = false;
        for (int i = 0; i < aToken.length(); i++) {
            char theChar = aToken.charAt(i);
            boolean theIsAnyString = theChar == WildcardQuery.WILDCARD_STRING;
            if (theIsAnyString || theChar == WildcardQuery.WILDCARD_CHAR) {
                if (theLiteral.length() > 0) {
                    theRegex.append(Pattern.quote(theLiteral.toString()));
                    theLiteral.setLength(0);
                }
                theRegex.append(WORD_CHARACTER);
                if (theIsAnyString) {
                    theRegex.append("*?");
                }
            } else {
                theLiteral.append(theChar);
                theHasLiteral = true;
            }
        }
        if (theLiteral.length() > 0) {
            theRegex.append(Pattern.quote(theLiteral.toString()));
        }
        return theHasLiteral ? theRegex.toString() : "";
    }

    /**
     * Creates the patterns used to locate the first token in a fragment, in order of preference:
     * a match at the start of a word first, a match anywhere in the text as fallback.
     *
     * @return the patterns, empty if the token contains no literal text
     */
    private static List<Pattern> firstTokenPatterns(String aToken) {
        List<Pattern> thePatterns = new ArrayList<>(2);
        String theRegex = toRegex(aToken);
        if (!theRegex.isEmpty()) {
            thePatterns.add(Pattern.compile("(?<!" + WORD_CHARACTER + ")(?:" + theRegex + ")", MATCH_FLAGS));
            thePatterns.add(Pattern.compile(theRegex, MATCH_FLAGS));
        }
        return thePatterns;
    }

    /**
     * Returns the start index of the first match of the first pattern that matches the text, or
     * {@code -1} if none of the patterns matches.
     */
    private static int indexOfFirstMatch(String aText, List<Pattern> aPatterns) {
        for (Pattern thePattern : aPatterns) {
            Matcher theMatcher = thePattern.matcher(aText);
            if (theMatcher.find()) {
                return theMatcher.start();
            }
        }
        return -1;
    }

    /**
     * Runs the string through the analyzer and returns the first token it produces.
     *
     * @return the first analyzed token, or {@code null} if the analyzer produces no token
     */
    private String analyze(String aFieldName, String aString) throws IOException {
        // try-with-resources closes the stream even if reset() or incrementToken() fails
        try (TokenStream theTokenStream = analyzer.tokenStream(aFieldName, aString)) {
            theTokenStream.reset();
            CharTermAttribute theCharTerms = theTokenStream.getAttribute(CharTermAttribute.class);
            String theFirstToken = theTokenStream.incrementToken() ? theCharTerms.toString() : null;
            theTokenStream.end();
            return theFirstToken;
        }
    }

    /**
     * Splits the phrase into the tokens to search for.
     *
     * <p>Tokens starting with {@code -} are dropped. The last token is turned into a wildcard
     * if it is longer than two characters. Wildcard tokens are used as typed, all other tokens
     * are passed through the analyzer.
     *
     * @return the tokens, empty if the phrase is {@code null} or contains no usable token
     */
    private List<String> toTokens(String aFieldName, String aPhrase) throws IOException {
        List<String> theTokens = new ArrayList<>();

        // StringUtils.split() returns null for a null phrase
        String[] theSplitTokens = StringUtils.split(aPhrase, TOKEN_SEPARATORS);
        if (theSplitTokens == null) {
            return theTokens;
        }

        for (int i = 0; i < theSplitTokens.length; i++) {
            String theToken = theSplitTokens[i];
            if (theToken.startsWith("-")) {
                continue;
            }
            // Mutate the last token to a wildcard
            boolean theIsLast = i == theSplitTokens.length - 1;
            if (theIsLast && theToken.length() > 2 && !QueryUtils.isWildCard(theToken)) {
                theToken = theToken + QueryUtils.ASTERISK;
            }
            if (QueryUtils.isWildCard(theToken)) {
                theTokens.add(theToken);
            } else {
                String theAnalyzed = analyze(aFieldName, theToken);
                if (theAnalyzed != null) {
                    theTokens.add(theAnalyzed);
                }
            }
        }

        return theTokens;
    }
}