/*
* Copyright 2011 Stefan Partusch
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.spartusch.nasfvi.server;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.similar.MoreLikeThis;
/**
* Searches the index with a {@link NQuery}.
* @author Stefan Partusch
*
*/
public class NSearcher {

    private static final Logger LOGGER =
            Logger.getLogger(NSearcher.class.getName());

    /** Names of fields to perform similarity searches on. */
    private static final String[] SIMILARITY_FIELDS =
            new String[] {"titel", "beschreibung"};

    /**
     * Number of documents to request beyond the offset when searching.
     * A search with offset {@code o} requests the top {@code o + RESULT_WINDOW}
     * documents.
     */
    private static final int RESULT_WINDOW = 5;

    /**
     * Matches runs of whitespace for {@link #normalizeValue(String)}.
     * Covers both ASCII whitespace ({@code \s}) and Unicode space
     * separators ({@code \p{Z}}); the original {@code \p{Z}+} alone missed
     * tabs and line breaks although the method is documented to normalize
     * whitespace. Compiled once because normalization runs per value.
     */
    private static final Pattern WHITESPACE = Pattern.compile("[\\s\\p{Z}]+");

    /** IndexSearcher that forms the basis of this searcher. */
    private final IndexSearcher searcher;

    /**
     * Creates a new NSearcher based on a Lucene IndexSearcher.
     * @param searcher IndexSearcher to form the basis of the new NSearcher
     */
    public NSearcher(final IndexSearcher searcher) {
        this.searcher = searcher;
    }

    /**
     * Gets the values of fields required for creating an answer in
     * natural language.
     * <p>
     * If the query asks for exactly one field (and that field is not a
     * collapsed field), the values of all matching documents are
     * aggregated. Otherwise only the single document at {@code offset}
     * (clamped to the last available document) is processed.
     * @param nquery NQuery used to retrieve <code>result</code>
     * @param result Documents matching <code>nquery</code>
     * @param offset Index of the first document to process
     * @return A mapping from the names of fields to sets of extracted values
     * @throws IOException if there is an IOException when accessing the index
     */
    public final Map<String, Set<String>> getAnswerValues(final NQuery nquery,
            final TopDocs result, final int offset) throws IOException {
        Map<String, Set<String>> values = new HashMap<String, Set<String>>();
        Set<String> answerFields = nquery.getFieldsToAnswer();

        // Guard against an empty scoreDocs array as well: clamping the
        // offset against a zero-length array would yield index -1 and an
        // ArrayIndexOutOfBoundsException below.
        if (result.totalHits == 0 || answerFields.isEmpty()
                || result.scoreDocs.length == 0) {
            return values;
        }

        // Clamp the offset to the last retrieved document.
        int docIndex = offset;
        if (docIndex >= result.scoreDocs.length) {
            docIndex = result.scoreDocs.length - 1;
        }

        String firstField = answerFields.iterator().next();
        if (answerFields.size() == 1 && !NQuery.isFieldToCollapse(firstField)) {
            // Aggregate all ScoreDocs (1 field, n documents)
            Set<String> hs = new HashSet<String>(result.scoreDocs.length);
            for (ScoreDoc sd : result.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                hs.addAll(extractValues(nquery, doc, firstField));
            }
            values.put(firstField, hs);
        } else {
            // Process first ScoreDoc only (n fields, 1 document)
            Document doc = searcher.doc(result.scoreDocs[docIndex].doc);
            for (String field : answerFields) {
                values.put(field, extractValues(nquery, doc, field));
            }
        }

        return values;
    }

    /**
     * Extracts a field's values from a document. This method is aware of
     * <i>collapsed</i> or <i>merged</i> fields and handles them properly.
     * <p>
     * For a collapsed field the merged field's contents are highlighted
     * against the query: values of merged-field instances actually matched
     * by the query are preferred; if no instance matches, the values of
     * all instances are returned as a fallback.
     * @param nquery NQuery used for searching
     * @param doc Document to extract the field's values from
     * @param field Name of the field to extract values for
     * @return Set of extracted values
     */
    private Set<String> extractValues(final NQuery nquery, final Document doc,
            final String field) {
        Set<String> values = new HashSet<String>();

        if (NQuery.isFieldToCollapse(field)) {
            // process merged field
            String mfield = NQuery.getMergedField();
            QueryScorer scorer = new QueryScorer(nquery.getQuery(), mfield);
            Highlighter highlighter = new Highlighter(scorer);
            // NullFragmenter: treat each field value as a single fragment.
            highlighter.setTextFragmenter(new NullFragmenter());

            try {
                Set<String> buffer = new HashSet<String>();
                for (Fieldable f : doc.getFieldables(mfield)) {
                    String content = f.stringValue();
                    String value =
                            normalizeValue(NQuery.extractValue(field, content));
                    // Test if the field was matched by the query
                    TokenStream ts = TokenSources.getTokenStream(mfield,
                            content, nquery.getAnalyzer());
                    if (highlighter.getBestFragment(ts, content) != null) {
                        values.add(value);
                    } else {
                        // Buffer the value - in case no field matches
                        buffer.add(value);
                    }
                }
                if (values.isEmpty()) {
                    // No field was matched by the query
                    values.addAll(buffer);
                }
            } catch (IOException e) {
                // Index access failed mid-extraction; keep the cause.
                throw new RuntimeException(e);
            } catch (InvalidTokenOffsetsException e) {
                throw new RuntimeException(e);
            }
        } else {
            for (String v : doc.getValues(field)) {
                values.add(normalizeValue(v));
            }
        }

        return values;
    }

    /**
     * Normalizes strings. This implementation collapses runs of whitespace
     * characters (ASCII whitespace and Unicode space separators) into a
     * single space and strips leading/trailing whitespace.
     * @param value String to normalize
     * @return Normalized string
     */
    private String normalizeValue(final String value) {
        return WHITESPACE.matcher(value).replaceAll(" ").trim();
    }

    /**
     * Searches the index using a Lucene query.
     * @param query Query to search for
     * @param maxHits Maximum number of documents to search for
     * @return Matching documents
     * @throws IOException if there is an IOException when accessing the index
     */
    private TopDocs search(final Query query,
            final int maxHits) throws IOException {
        TopDocs result = searcher.search(query, maxHits);
        LOGGER.info("Search: " + query + ";\t(Hits: " + result.totalHits + ")");
        return result;
    }

    /**
     * Searches the index using a {@link NQuery}.
     * <p>
     * If the query contains a similarity query, the best match of that
     * similarity query is used as the reference document for a
     * {@link MoreLikeThis} query on {@link #SIMILARITY_FIELDS}; the
     * reference document itself is excluded from the results.
     * @param nquery Query to search for
     * @param offset Offset to use for the search
     * @return Matching documents
     * @throws IOException if there is an IOException when accessing the index
     */
    public final TopDocs search(final NQuery nquery,
            final int offset) throws IOException {
        Query q = nquery.getQuery();

        if (nquery.hasSimilarityQuery()) {
            Query similQuery = nquery.getSimilarityQuery();
            TopDocs similDocs = search(similQuery, 1);
            if (similDocs.totalHits == 0) {
                // Nothing to be similar to - return an empty result.
                return new TopDocs(0, new ScoreDoc[0], 0f);
            }
            int similDocNum = similDocs.scoreDocs[0].doc;
            String similId = searcher.doc(similDocNum).get("id");
            Query exclude = new TermQuery(new Term("id", similId));
            // exclude the document compared to
            MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
            mlt.setFieldNames(SIMILARITY_FIELDS);
            Query moreLikeQuery = mlt.like(similDocNum);
            BooleanQuery booleanQuery = new BooleanQuery();
            booleanQuery.add(q, BooleanClause.Occur.MUST);
            booleanQuery.add(moreLikeQuery, BooleanClause.Occur.MUST);
            booleanQuery.add(exclude, BooleanClause.Occur.MUST_NOT);
            q = booleanQuery;
        }

        // Request enough hits to cover the offset plus one result window.
        return search(q, offset + RESULT_WINDOW);
    }

    /**
     * Creates a JSON representation of a {@link NQuery} and the results
     * of a search.
     * @param nquery NQuery to include in the JSON representation
     * @param result Search results to include
     * @param offset Offset to include
     * @return JSON representation including <code>nquery</code>,
     * <code>result</code> and <code>offset</code>
     */
    public final String toJson(final NQuery nquery, final TopDocs result,
            final int offset) {
        StringBuilder sb = new StringBuilder();
        // {
        //  "NQuery": nquery,
        //  "Offset": offset,
        //  "Hits": totalHits
        // }
        sb.append("{\n\"NQuery\": ").append(nquery.toString()).append(",\n");
        sb.append("\"Offset\": ").append(offset).append(",\n");
        sb.append("\"Hits\": ").append(result.totalHits).append("\n}");
        return sb.toString();
    }

}