package org.apache.lucene.search.suggest.analyzing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.codecs.lucene46.Lucene46Codec;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FilterAtomicReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.sorter.EarlyTerminatingSortingCollector;
import org.apache.lucene.index.sorter.SortingMergePolicy;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.Version;
// TODO:
// - a PostingsFormat that stores super-high-freq terms as
// a bitset should be a win for the prefix terms?
// (LUCENE-5052)
// - we could offer a better integration with
// DocumentDictionary and NRT? so that your suggester
// "automatically" keeps in sync w/ your index
/** Analyzes the input text and then suggests matches based
* on prefix matches to any tokens in the indexed text.
* This also highlights the tokens that match.
*
* <p>This just uses an ordinary Lucene index. It
* supports payloads, and records these as a
* {@link BinaryDocValues} field. Matches are sorted only
* by the suggest weight; it would be nice to support
* blended score + weight sort in the future. This means
* this suggester best applies when there is a strong
* a priori ranking of all the suggestions. */
public class AnalyzingInfixSuggester extends Lookup implements Closeable {
/** Field name used for the indexed text. */
protected final static String TEXT_FIELD_NAME = "text";
/** Field name used for the indexed text, as a
* StringField, for exact lookup. */
protected final static String EXACT_TEXT_FIELD_NAME = "exacttext";
/** Analyzer used at search time */
protected final Analyzer queryAnalyzer;
/** Analyzer used at index time */
protected final Analyzer indexAnalyzer;
// Lucene version used to configure the analyzers, ngram filter and IndexWriter.
final Version matchVersion;
// Directory holding the suggest index; closed by close() per the class contract.
private final Directory dir;
// Query prefixes shorter than this hit the "textgrams" ngram field instead of
// a (more costly) PrefixQuery on the main text field.
final int minPrefixChars;
/** Used for ongoing NRT additions/updates. */
private IndexWriter writer;
/** {@link IndexSearcher} used for lookups. */
protected SearcherManager searcherMgr;
/** Default minimum number of leading characters before
* PrefixQuery is used (4). */
public static final int DEFAULT_MIN_PREFIX_CHARS = 4;
/** How we sort the postings and search results. */
private static final Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true));
/** Create a new instance, loading from a previously built
 * directory, if it exists. Note that {@link #close}
 * will also close the provided directory.
 *
 * @param matchVersion Lucene version to mimic
 * @param dir directory holding (or that will hold) the suggest index
 * @param analyzer analyzer used at both index and query time
 */
public AnalyzingInfixSuggester(Version matchVersion, Directory dir, Analyzer analyzer) throws IOException {
// Convenience: same analyzer for both sides, default short-prefix cutoff.
this(matchVersion, dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS);
}
/** Create a new instance, loading from a previously built
 * directory, if it exists. Note that {@link #close}
 * will also close the provided directory.
 *
 * @param minPrefixChars Minimum number of leading characters
 * before PrefixQuery is used (default 4).
 * Prefixes shorter than this are indexed as character
 * ngrams (increasing index size but making lookups
 * faster).
 */
public AnalyzingInfixSuggester(Version matchVersion, Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars) throws IOException {
if (minPrefixChars < 0) {
throw new IllegalArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars);
}
this.queryAnalyzer = queryAnalyzer;
this.indexAnalyzer = indexAnalyzer;
this.matchVersion = matchVersion;
this.dir = dir;
this.minPrefixChars = minPrefixChars;
// If a suggest index already exists in dir, open it for NRT updates
// immediately; otherwise writer/searcherMgr stay null until build()
// is called, and lookup() throws IllegalStateException until then.
if (DirectoryReader.indexExists(dir)) {
// Already built; open it:
writer = new IndexWriter(dir,
getIndexWriterConfig(matchVersion, getGramAnalyzer(), IndexWriterConfig.OpenMode.APPEND));
// applyAllDeletes=true so lookups never return replaced suggestions.
searcherMgr = new SearcherManager(writer, true, null);
}
}
/** Override this to customize index settings, e.g. which
 * codec to use.
 *
 * @param matchVersion Lucene version to mimic
 * @param indexAnalyzer analyzer the writer will use (already ngram-wrapped)
 * @param openMode whether to create a fresh index or append to an existing one
 */
protected IndexWriterConfig getIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
  final IndexWriterConfig config = new IndexWriterConfig(matchVersion, indexAnalyzer);
  config.setOpenMode(openMode);
  config.setCodec(new Lucene46Codec());
  // Sort segments by suggestion weight at merge time, so searches over
  // those segments can early-terminate once enough hits are collected:
  config.setMergePolicy(new SortingMergePolicy(config.getMergePolicy(), SORT));
  return config;
}
/** Subclass can override to choose a specific {@link
 * Directory} implementation.
 *
 * @param path filesystem location for the suggest index
 */
protected Directory getDirectory(File path) throws IOException {
  // Default: let FSDirectory choose the best implementation for the platform.
  final Directory directory = FSDirectory.open(path);
  return directory;
}
/** Builds the suggest index from scratch from the given suggestions,
 * discarding any previously indexed suggestions.
 *
 * <p>Fix: the original declared an {@code AtomicReader r} local that was
 * never assigned, so closing it in the finally block was a dead no-op;
 * the variable has been removed.
 *
 * @param iter source of suggestion text, weight and (optional) payload
 * @throws IOException if writing the underlying index fails
 */
@Override
public void build(InputIterator iter) throws IOException {
  // Tear down any previous searcher/writer; we rebuild from scratch:
  if (searcherMgr != null) {
    searcherMgr.close();
    searcherMgr = null;
  }
  if (writer != null) {
    writer.close();
    writer = null;
  }
  boolean success = false;
  try {
    // First pass: build a temporary normal Lucene index,
    // just indexing the suggestions as they iterate:
    writer = new IndexWriter(dir,
        getIndexWriterConfig(matchVersion, getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE));
    BytesRef text;
    // Reuse a single Document and its Field instances across all
    // suggestions to avoid per-suggestion allocation:
    Document doc = new Document();
    FieldType ft = getTextFieldType();
    Field textField = new Field(TEXT_FIELD_NAME, "", ft);
    doc.add(textField);
    Field textGramField = new Field("textgrams", "", ft);
    doc.add(textGramField);
    Field exactTextField = new StringField(EXACT_TEXT_FIELD_NAME, "", Field.Store.NO);
    doc.add(exactTextField);
    Field textDVField = new BinaryDocValuesField(TEXT_FIELD_NAME, new BytesRef());
    doc.add(textDVField);
    // TODO: use threads...?
    Field weightField = new NumericDocValuesField("weight", 0L);
    doc.add(weightField);
    Field payloadField;
    if (iter.hasPayloads()) {
      payloadField = new BinaryDocValuesField("payloads", new BytesRef());
      doc.add(payloadField);
    } else {
      payloadField = null;
    }
    while ((text = iter.next()) != null) {
      String textString = text.utf8ToString();
      textField.setStringValue(textString);
      exactTextField.setStringValue(textString);
      textGramField.setStringValue(textString);
      textDVField.setBytesValue(text);
      weightField.setLongValue(iter.weight());
      if (iter.hasPayloads()) {
        payloadField.setBytesValue(iter.payload());
      }
      writer.addDocument(doc);
    }
    searcherMgr = new SearcherManager(writer, true, null);
    success = true;
  } finally {
    if (!success) {
      // Close the half-built writer without masking the original exception:
      IOUtils.closeWhileHandlingException(writer);
      writer = null;
    }
  }
}
/** Wraps {@link #indexAnalyzer} so that the "textgrams" field also emits
 * leading edge-ngrams (lengths 1..minPrefixChars) of every token, which
 * makes short-prefix lookups a cheap TermQuery instead of a PrefixQuery. */
private Analyzer getGramAnalyzer() {
  return new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    protected Analyzer getWrappedAnalyzer(String fieldName) {
      // Every field analyzes with the same base analyzer:
      return indexAnalyzer;
    }

    @Override
    protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
      // Only "textgrams" gets the ngram filter, and only when
      // short-prefix indexing is enabled at all:
      if (minPrefixChars == 0 || !fieldName.equals("textgrams")) {
        return components;
      }
      TokenStream grams = new EdgeNGramTokenFilter(matchVersion,
                                                   components.getTokenStream(),
                                                   1, minPrefixChars);
      return new TokenStreamComponents(components.getTokenizer(), grams);
    }
  };
}
/** Adds a new suggestion. Be sure to use {@link #update}
 * instead if you want to replace a previous suggestion.
 * After adding or updating a batch of new suggestions,
 * you must call {@link #refresh} in the end in order to
 * see the suggestions in {@link #lookup} */
public void add(BytesRef text, long weight, BytesRef payload) throws IOException {
  writer.addDocument(buildDocument(text, weight, payload));
}

/** Updates a previous suggestion, matching the exact same
 * text as before. Use this to change the weight or
 * payload of an already added suggestion. If you know
 * this text is not already present you can use {@link
 * #add} instead. After adding or updating a batch of
 * new suggestions, you must call {@link #refresh} in the
 * end in order to see the suggestions in {@link #lookup} */
public void update(BytesRef text, long weight, BytesRef payload) throws IOException {
  // EXACT_TEXT_FIELD_NAME is an unanalyzed StringField, so the delete
  // term must be the full original suggestion text:
  writer.updateDocument(new Term(EXACT_TEXT_FIELD_NAME, text.utf8ToString()),
                        buildDocument(text, weight, payload));
}

/** Builds the document shared by {@link #add} and {@link #update}:
 * analyzed text, ngram text, exact (unanalyzed) text, plus doc values
 * for the text, weight and (if non-null) payload. */
private Document buildDocument(BytesRef text, long weight, BytesRef payload) {
  String textString = text.utf8ToString();
  Document doc = new Document();
  FieldType ft = getTextFieldType();
  doc.add(new Field(TEXT_FIELD_NAME, textString, ft));
  doc.add(new Field("textgrams", textString, ft));
  doc.add(new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO));
  doc.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text));
  doc.add(new NumericDocValuesField("weight", weight));
  if (payload != null) {
    doc.add(new BinaryDocValuesField("payloads", payload));
  }
  return doc;
}
/** Reopens the underlying searcher; it's best to "batch
 * up" many additions/updates, and then call refresh
 * once in the end. */
public void refresh() throws IOException {
// Blocks until a searcher reflecting all prior add()/update() calls
// is visible to subsequent lookup() calls.
// NOTE(review): throws NullPointerException if neither the constructor
// nor build() opened an index (searcherMgr is null) — presumably
// callers must build first; confirm intended.
searcherMgr.maybeRefreshBlocking();
}
/**
 * Subclass can override this method to change the field type of the text field
 * e.g. to change the index options
 */
protected FieldType getTextFieldType() {
  final FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  // Results are ranked by suggestion weight, not relevance score, so
  // norms and per-doc term frequencies/positions would be wasted space:
  fieldType.setOmitNorms(true);
  fieldType.setIndexOptions(IndexOptions.DOCS_ONLY);
  return fieldType;
}
@Override
public List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num) throws IOException {
// NOTE: onlyMorePopular is ignored; results are always sorted by
// weight. Delegates with allTermsRequired=true and doHighlight=true.
return lookup(key, num, true, true);
}
/** This is called if the last token isn't ended
 * (e.g. user did not type a space after it). Return an
 * appropriate Query clause to add to the BooleanQuery.
 *
 * @param token the (possibly partial) trailing query token
 */
protected Query getLastTokenQuery(String token) throws IOException {
  if (token.length() >= minPrefixChars) {
    // Long enough that a PrefixQuery on the main text field is affordable:
    return new PrefixQuery(new Term(TEXT_FIELD_NAME, token));
  }
  // The leading ngram was directly indexed, so an exact TermQuery suffices:
  return new TermQuery(new Term("textgrams", token));
}
/** Retrieve suggestions, specifying whether all terms
 * must match ({@code allTermsRequired}) and whether the hits
 * should be highlighted ({@code doHighlight}).
 *
 * <p>Fixes: {@code matchedTokens} was allocated twice (once at
 * declaration and again inside the try block), and the
 * {@code lastToken != null} check inside the token loop was dead
 * (it had just been assigned from {@code termAtt.toString()}).
 *
 * @param key query text; the trailing token is prefix-matched unless the
 *        analyzer discarded trailing chars (e.g. the user typed a space)
 * @param num maximum number of suggestions to return
 * @param allTermsRequired if true all tokens are MUST clauses, else SHOULD
 * @param doHighlight whether to highlight matched tokens in the results
 * @throws IllegalStateException if the suggester was never built/opened
 */
public List<LookupResult> lookup(CharSequence key, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {

  if (searcherMgr == null) {
    throw new IllegalStateException("suggester was not built");
  }

  final BooleanClause.Occur occur =
      allTermsRequired ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;

  BooleanQuery query;
  Set<String> matchedTokens = new HashSet<>();
  String prefixToken = null;

  try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
    ts.reset();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    String lastToken = null;
    query = new BooleanQuery();
    int maxEndOffset = -1;
    while (ts.incrementToken()) {
      if (lastToken != null) {
        // All tokens but the last must match exactly:
        matchedTokens.add(lastToken);
        query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
      }
      lastToken = termAtt.toString();
      maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
    }
    ts.end();

    if (lastToken != null) {
      Query lastQuery;
      if (maxEndOffset == offsetAtt.endOffset()) {
        // Use PrefixQuery (or the ngram equivalent) when
        // there was no trailing discarded chars in the
        // string (e.g. whitespace), so that if query does
        // not end with a space we show prefix matches for
        // that token:
        lastQuery = getLastTokenQuery(lastToken);
        prefixToken = lastToken;
      } else {
        // Use TermQuery for an exact match if there were
        // trailing discarded chars (e.g. whitespace), so
        // that if query ends with a space we only show
        // exact matches for that term:
        matchedTokens.add(lastToken);
        lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
      }
      // getLastTokenQuery is overridable and may return null:
      if (lastQuery != null) {
        query.add(lastQuery, occur);
      }
    }
  }

  // TODO: we could allow blended sort here, combining
  // weight w/ score. Now we ignore score and sort only
  // by weight:
  Query finalQuery = finishQuery(query, allTermsRequired);

  // Sort by weight, descending:
  TopFieldCollector c = TopFieldCollector.create(SORT, num, true, false, false, false);

  // We sorted postings by weight during indexing, so we
  // only retrieve the first num hits now:
  Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num);
  IndexSearcher searcher = searcherMgr.acquire();
  List<LookupResult> results = null;
  try {
    searcher.search(finalQuery, c2);
    TopFieldDocs hits = (TopFieldDocs) c.topDocs();
    // Slower way if postings are not pre-sorted by weight:
    // hits = searcher.search(query, null, num, SORT);
    results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
  } finally {
    searcherMgr.release(searcher);
  }
  return results;
}
/**
 * Create the results based on the search hits.
 * Can be overridden by subclass to add particular behavior (e.g. weight transformation)
 *
 * @param searcher searcher the hits came from (still acquired by the caller)
 * @param hits weight-sorted hits to convert into LookupResults
 * @param num requested result count (hits were already limited by the collector)
 * @param charSequence the original lookup key (unused by the default impl)
 * @param doHighlight if true, each result's text is run through {@link #highlight}
 * @param matchedTokens analyzed tokens to highlight as whole matches
 * @param prefixToken trailing prefix token, or null if the last token was exact
 * @throws IOException If there are problems reading fields from the underlying Lucene index.
 */
protected List<LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits, int num,
CharSequence charSequence,
boolean doHighlight, Set<String> matchedTokens, String prefixToken)
throws IOException {
// The suggestion text was also indexed as binary doc values:
BinaryDocValues textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);
// This will just be null if app didn't pass payloads to build():
// TODO: maybe just stored fields? they compress...
BinaryDocValues payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");
List<LookupResult> results = new ArrayList<>();
// scratch is reused across hits; its bytes are copied out via utf8ToString:
BytesRef scratch = new BytesRef();
for (int i=0;i<hits.scoreDocs.length;i++) {
FieldDoc fd = (FieldDoc) hits.scoreDocs[i];
textDV.get(fd.doc, scratch);
String text = scratch.utf8ToString();
// fields[0] is the "weight" sort value (boxed Long):
long score = (Long) fd.fields[0];
BytesRef payload;
if (payloadsDV != null) {
// Fresh BytesRef per hit: LookupResult keeps a reference to it.
payload = new BytesRef();
payloadsDV.get(fd.doc, payload);
} else {
payload = null;
}
LookupResult result;
if (doHighlight) {
Object highlightKey = highlight(text, matchedTokens, prefixToken);
result = new LookupResult(highlightKey.toString(), highlightKey, score, payload);
} else {
result = new LookupResult(text, score, payload);
}
results.add(result);
}
return results;
}
/** Subclass can override this to tweak the Query before
 * searching.
 *
 * @param in the BooleanQuery built from the analyzed lookup key
 * @param allTermsRequired whether clauses were added as MUST (true) or SHOULD
 * @return the query to actually execute; default returns {@code in} unchanged
 */
protected Query finishQuery(BooleanQuery in, boolean allTermsRequired) {
return in;
}
/** Override this method to customize the Object
 * representing a single highlighted suggestions; the
 * result is set on each {@link
 * LookupResult#highlightKey} member. */
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
StringBuilder sb = new StringBuilder();
// upto = number of chars of the original text already emitted into sb:
int upto = 0;
while (ts.incrementToken()) {
String token = termAtt.toString();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (upto < startOffset) {
// Copy the gap between tokens (whitespace/punctuation) verbatim:
addNonMatch(sb, text.substring(upto, startOffset));
upto = startOffset;
} else if (upto > startOffset) {
// Skip tokens whose text overlaps output already emitted
// (e.g. stacked/overlapping tokens from the analyzer):
continue;
}
if (matchedTokens.contains(token)) {
// Token matches.
addWholeMatch(sb, text.substring(startOffset, endOffset), token);
upto = endOffset;
} else if (prefixToken != null && token.startsWith(prefixToken)) {
// The trailing query token only needs to be a prefix of this token:
addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
upto = endOffset;
}
// Non-matching tokens are left for the next gap-copy above.
}
ts.end();
int endOffset = offsetAtt.endOffset();
if (upto < endOffset) {
// Emit any trailing text after the last token:
addNonMatch(sb, text.substring(upto));
}
return sb.toString();
}
}
/** Called while highlighting a single result, to append a
 * non-matching chunk of text from the suggestion to the
 * provided {@code StringBuilder}.
 * @param sb The {@code StringBuilder} to append to
 * @param text The text chunk to add
 */
protected void addNonMatch(StringBuilder sb, String text) {
sb.append(text);
}
/** Called while highlighting a single result, to append
 * the whole matched token, wrapped in {@code <b>..</b>},
 * to the output.
 * @param sb The {@code StringBuilder} to append to
 * @param surface The surface form (original) text
 * @param analyzed The analyzed token corresponding to the surface form text
 */
protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
  sb.append("<b>").append(surface).append("</b>");
}
/** Called while highlighting a single result, to append a
 * matched prefix token to the output: the matched prefix is
 * wrapped in {@code <b>..</b>}, the remainder is appended plain.
 * @param sb The {@code StringBuilder} to append to
 * @param surface The fragment of the surface form
 * (indexed during {@link #build}), corresponding to
 * this match
 * @param analyzed The analyzed token that matched
 * @param prefixToken The prefix of the token that matched
 */
protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
  // TODO: apps can try to invert their analysis logic
  // here, e.g. downcase the two before checking prefix:
  final int prefixLength = prefixToken.length();
  sb.append("<b>").append(surface.substring(0, prefixLength)).append("</b>");
  if (prefixLength < surface.length()) {
    sb.append(surface.substring(prefixLength));
  }
}
/** Persisting to a {@link DataOutput} is not supported; the suggester's
 * state lives in its {@link Directory}. Always returns false.
 * (Fix: the parameter was misleadingly named {@code in} for an output.) */
@Override
public boolean store(DataOutput output) throws IOException {
  return false;
}
/** Loading from a {@link DataInput} is not supported; open over the
 * previously built {@link Directory} instead. Always returns false.
 * (Fix: the parameter was misleadingly named {@code out} for an input.) */
@Override
public boolean load(DataInput input) throws IOException {
  return false;
}
/** Closes the searcher manager, the {@link IndexWriter} and the
 * {@link Directory} this suggester was created over.
 *
 * <p>Fix: previously the directory was only closed when {@code writer}
 * was non-null, so it leaked when the suggester was constructed over a
 * directory with no pre-existing index and {@link #build} was never
 * called — contradicting the constructor javadoc's promise that close()
 * also closes the provided directory. */
@Override
public void close() throws IOException {
  if (searcherMgr != null) {
    searcherMgr.close();
    searcherMgr = null;
  }
  if (writer != null) {
    writer.close();
    writer = null;
  }
  // Always close the directory, even if no index was ever opened/built:
  dir.close();
}
/** Returns an estimate of the heap used by this suggester, including
 * RAM held by the open segments of the underlying index.
 *
 * <p>Fix: the {@code instanceof} test was performed on the unwrapped
 * {@code reader}, but the cast was applied to the possibly-wrapped
 * {@code context.reader()}; if a FilterAtomicReader wrapped a
 * SegmentReader, that cast threw ClassCastException. The cast now
 * targets the same unwrapped reader that was type-checked. */
@Override
public long sizeInBytes() {
  long mem = RamUsageEstimator.shallowSizeOf(this);
  try {
    if (searcherMgr != null) {
      IndexSearcher searcher = searcherMgr.acquire();
      try {
        for (AtomicReaderContext context : searcher.getIndexReader().leaves()) {
          AtomicReader reader = FilterAtomicReader.unwrap(context.reader());
          if (reader instanceof SegmentReader) {
            // Cast the unwrapped reader — the one we just type-checked:
            mem += ((SegmentReader) reader).ramBytesUsed();
          }
        }
      } finally {
        searcherMgr.release(searcher);
      }
    }
    return mem;
  } catch (IOException ioe) {
    // Lookup#sizeInBytes does not declare IOException; surface as unchecked.
    throw new RuntimeException(ioe);
  }
}
/** Returns the number of (live) suggestions currently in the index. */
@Override
public long getCount() throws IOException {
  final IndexSearcher acquired = searcherMgr.acquire();
  try {
    return acquired.getIndexReader().numDocs();
  } finally {
    // Always hand the searcher back, even if numDocs() throws:
    searcherMgr.release(acquired);
  }
}
};