/**
* FreeDesktopSearch - A Search Engine for your Desktop
* Copyright (C) 2013 Mirko Sertic
*
* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
* warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program; if not, see
* <http://www.gnu.org/licenses/>.
*/
package de.mirkosertic.desktopsearch;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
class QueryParser {
private final Analyzer analyzer;
public QueryParser(Analyzer aAnalyzer) {
analyzer = aAnalyzer;
}
private String toToken(String aToken, String aSearchField) throws IOException {
try (TokenStream theStream = analyzer.tokenStream(aSearchField, aToken)) {
CharTermAttribute theAttribute = theStream.getAttribute(CharTermAttribute.class);
theStream.reset();
if (theStream.incrementToken()) {
return theAttribute.toString();
}
}
return "";
}
private void addToBooleanQuery(List<String> aTermList, String aFieldName, BooleanQuery.Builder aQuery, BooleanClause.Occur aOccour)
throws IOException {
for (String theTerm : aTermList) {
if (QueryUtils.isWildCard(theTerm)) {
aQuery.add(new WildcardQuery(new Term(aFieldName, theTerm)), aOccour);
} else if (QueryUtils.isFuzzy(theTerm)) {
aQuery.add(new FuzzyQuery(new Term(aFieldName, theTerm)), aOccour);
} else {
String theTokenizedTerm = toToken(theTerm, aFieldName);
if (!StringUtils.isEmpty(theTokenizedTerm)) {
aQuery.add(new TermQuery(new Term(aFieldName, theTokenizedTerm)), aOccour);
}
}
}
}
public Query parse(String aQuery, String aSearchField) throws IOException {
QueryTokenizer theTokenizer = new QueryTokenizer(aQuery);
// Now we have the terms, lets construct the query
BooleanQuery.Builder theResult = new BooleanQuery.Builder();
if (!theTokenizer.getRequiredTerms().isEmpty()) {
List<SpanQuery> theSpans = new ArrayList<>();
for (String theTerm : theTokenizer.getRequiredTerms()) {
if (QueryUtils.isWildCard(theTerm)) {
theSpans.add(new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term(aSearchField, theTerm))));
} else if (QueryUtils.isFuzzy(theTerm)) {
theSpans.add(new SpanMultiTermQueryWrapper<>(new FuzzyQuery(new Term(aSearchField, theTerm))));
} else {
// Ok, we need to check of the token would be removed due to stopwords and so on
String theTokenizedTerm = toToken(theTerm, aSearchField);
if (!StringUtils.isEmpty(theTokenizedTerm)) {
theSpans.add(new SpanTermQuery(new Term(aSearchField, theTokenizedTerm)));
}
}
}
if (theSpans.size() > 1) {
// This is the original span, so we boost it a lot
SpanQuery theExactMatchQuery = new SpanNearQuery(theSpans.toArray(new SpanQuery[theSpans.size()]), 0, true);
theResult.add(new BoostQuery(theExactMatchQuery, 61), BooleanClause.Occur.SHOULD);
// We expect a maximum edit distance of 10 between the searched terms in any order
// This seems to be the most useful value
int theMaxEditDistance = 10;
for (int theSlop = 0; theSlop < theMaxEditDistance; theSlop++) {
SpanQuery theNearQuery = new SpanNearQuery(theSpans.toArray(new SpanQuery[theSpans.size()]), theSlop, false);
theResult.add(new BoostQuery(theNearQuery, 50 + theMaxEditDistance - theSlop), BooleanClause.Occur.SHOULD);
}
}
// Finally, we just add simple term queries, but do not boost them
// This makes sure that at least the searched terms
// are found in the document
addToBooleanQuery(theTokenizer.getRequiredTerms(), aSearchField, theResult, BooleanClause.Occur.MUST);
}
// Finally, add the terms that must not occur in the search result
addToBooleanQuery(theTokenizer.getNotRequiredTerms(), aSearchField, theResult, BooleanClause.Occur.MUST_NOT);
return theResult.build();
}
}