/*
 * Copyright 2011 Stefan Partusch
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.spartusch.nasfvi.server;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.similar.MoreLikeThis;

/**
 * Searches the index with a {@link NQuery}.
 * @author Stefan Partusch
 */
public class NSearcher {

    private static final Logger LOGGER =
        Logger.getLogger(NSearcher.class.getName());

    /** Names of fields to perform similarity searches on. */
    private static final String[] SIMILARITY_FIELDS =
        new String[] {"titel", "beschreibung"};

    /** IndexSearcher that forms the basis of this searcher. */
    private final IndexSearcher searcher;

    /**
     * Creates a new NSearcher based on a Lucene IndexSearcher.
     * @param searcher IndexSearcher to form the basis of the new NSearcher
     */
    public NSearcher(final IndexSearcher searcher) {
        this.searcher = searcher;
    }
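
    /*
     * Usage sketch. How the NQuery is built is not shown in this class, so
     * its construction is only indicated here:
     *
     *   IndexSearcher indexSearcher = ...; // opened on the application's index
     *   NSearcher nsearcher = new NSearcher(indexSearcher);
     *   NQuery nquery = ...;               // parsed natural-language query
     *
     *   TopDocs hits = nsearcher.search(nquery, 0);
     *   Map<String, Set<String>> answers =
     *       nsearcher.getAnswerValues(nquery, hits, 0);
     *   String json = nsearcher.toJson(nquery, hits, 0);
     */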

    /**
     * Gets the values of fields required for creating an answer in
     * natural language.
     * @param nquery NQuery used to retrieve <code>result</code>
     * @param result Documents matching <code>nquery</code>
     * @param offset Index of the first document to process
     * @return A mapping from the names of fields to sets of extracted values
     * @throws IOException if there is an IOException when accessing the index
     */
    public final Map<String, Set<String>> getAnswerValues(final NQuery nquery,
            final TopDocs result, int offset) throws IOException {
        Map<String, Set<String>> values = new HashMap<String, Set<String>>();
        Set<String> answerFields = nquery.getFieldsToAnswer();

        if (result.totalHits == 0 || answerFields.isEmpty()) {
            return values;
        }

        if (offset >= result.scoreDocs.length) {
            offset = result.scoreDocs.length - 1;
        }

        String firstField = answerFields.iterator().next();

        if (answerFields.size() == 1 && !NQuery.isFieldToCollapse(firstField)) {
            // Aggregate all ScoreDocs (1 field, n documents)
            Set<String> hs = new HashSet<String>(result.scoreDocs.length);
            for (ScoreDoc sd : result.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                hs.addAll(extractValues(nquery, doc, firstField));
            }
            values.put(firstField, hs);
        } else {
            // Process the first ScoreDoc only (n fields, 1 document)
            Document doc = searcher.doc(result.scoreDocs[offset].doc);
            for (String field : answerFields) {
                values.put(field, extractValues(nquery, doc, field));
            }
        }

        return values;
    }

    /**
     * Extracts a field's values from a document. This method is aware of
     * <i>collapsed</i> or <i>merged</i> fields and handles them properly.
     * @param nquery NQuery used for searching
     * @param doc Document to extract the field's values from
     * @param field Name of the field to extract values for
     * @return Set of extracted values
     */
    private Set<String> extractValues(final NQuery nquery, final Document doc,
            final String field) {
        Set<String> values = new HashSet<String>();

        if (NQuery.isFieldToCollapse(field)) {
            // Process the merged field
            String mfield = NQuery.getMergedField();
            QueryScorer scorer = new QueryScorer(nquery.getQuery(), mfield);
            Highlighter highlighter = new Highlighter(scorer);
            highlighter.setTextFragmenter(new NullFragmenter());

            try {
                Set<String> buffer = new HashSet<String>();

                for (Fieldable f : doc.getFieldables(mfield)) {
                    String content = f.stringValue();
                    String value =
                        normalizeValue(NQuery.extractValue(field, content));

                    // Test if the field was matched by the query
                    TokenStream ts = TokenSources.getTokenStream(mfield,
                            content, nquery.getAnalyzer());

                    if (highlighter.getBestFragment(ts, content) != null) {
                        values.add(value);
                    } else {
                        // Buffer the value - in case no field matches
                        buffer.add(value);
                    }
                }

                if (values.isEmpty()) {
                    // No field was matched by the query
                    values.addAll(buffer);
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            } catch (InvalidTokenOffsetsException e) {
                throw new RuntimeException(e);
            }
        } else {
            for (String v : doc.getValues(field)) {
                values.add(normalizeValue(v));
            }
        }

        return values;
    }

    /**
     * Normalizes strings. This implementation collapses runs of Unicode
     * space separators into a single space and trims the result.
     * @param value String to normalize
     * @return Normalized string
     */
    private String normalizeValue(final String value) {
        return value.replaceAll("\\p{Z}+", " ").trim();
    }
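
    // The private search(Query, int) below runs the plain Lucene search and
    // logs the hit count. The public search(NQuery, int) builds on it: if the
    // NQuery carries a similarity query, the query that is finally executed
    // has the form
    //
    //   +<original query> +<MoreLikeThis on titel/beschreibung> -id:<seed document>
    //
    // i.e. the document the similarity is computed from is excluded from the hits.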

    /**
     * Searches the index using a Lucene query.
     * @param query Query to search for
     * @param maxHits Maximum number of documents to search for
     * @return Matching documents
     * @throws IOException if there is an IOException when accessing the index
     */
    private TopDocs search(final Query query, final int maxHits)
            throws IOException {
        TopDocs result = searcher.search(query, maxHits);
        LOGGER.info("Search: " + query + ";\t(Hits: " + result.totalHits + ")");
        return result;
    }

    /**
     * Searches the index using a {@link NQuery}.
     * @param nquery Query to search for
     * @param offset Offset to use for the search
     * @return Matching documents
     * @throws IOException if there is an IOException when accessing the index
     */
    public final TopDocs search(final NQuery nquery, final int offset)
            throws IOException {
        Query q = nquery.getQuery();

        if (nquery.hasSimilarityQuery()) {
            Query similQuery = nquery.getSimilarityQuery();
            TopDocs similDocs = search(similQuery, 1);

            if (similDocs.totalHits == 0) {
                return new TopDocs(0, new ScoreDoc[0], 0f);
            }

            int similDocNum = similDocs.scoreDocs[0].doc;
            String similId = searcher.doc(similDocNum).get("id");
            // Exclude the document the similarity comparison is based on
            Query exclude = new TermQuery(new Term("id", similId));

            MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
            mlt.setFieldNames(SIMILARITY_FIELDS);
            Query moreLikeQuery = mlt.like(similDocNum);

            BooleanQuery booleanQuery = new BooleanQuery();
            booleanQuery.add(q, BooleanClause.Occur.MUST);
            booleanQuery.add(moreLikeQuery, BooleanClause.Occur.MUST);
            booleanQuery.add(exclude, BooleanClause.Occur.MUST_NOT);
            q = booleanQuery;
        }

        // Retrieve enough hits to serve five results beyond the offset
        return search(q, offset + 5);
    }

    /**
     * Creates a JSON representation of a {@link NQuery} and the results
     * of a search.
     * @param nquery NQuery to include in the JSON representation
     * @param result Search results to include
     * @param offset Offset to include
     * @return JSON representation including <code>nquery</code>,
     *         <code>result</code> and <code>offset</code>
     */
    public final String toJson(final NQuery nquery, final TopDocs result,
            final int offset) {
        StringBuilder sb = new StringBuilder();
        // {
        //     "NQuery": nquery,
        //     "Offset": offset,
        //     "Hits": totalHits
        // }
        sb.append("{\n\"NQuery\": ").append(nquery.toString()).append(",\n");
        sb.append("\"Offset\": ").append(offset).append(",\n");
        sb.append("\"Hits\": ").append(result.totalHits).append("\n}");
        return sb.toString();
    }

}