/* * Copyright (c) 2011 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.flaptor.indextank.search; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.log4j.Logger; import com.flaptor.indextank.index.Document; import com.flaptor.indextank.query.AToken; import com.flaptor.indextank.query.IndexEngineParser; import com.flaptor.indextank.query.Query; import com.flaptor.indextank.query.TermQuery; import com.flaptor.indextank.storage.alternatives.DocumentStorage; import com.flaptor.indextank.util.CharacterTranslator; import com.flaptor.util.Execute; import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.common.collect.Sets; public class SnippetSearcher extends AbstractDocumentSearcher { private static final Logger logger = Logger.getLogger(Execute.whoAmI()); private final DocumentSearcher delegate; private final DocumentStorage storage; private final IndexEngineParser parser; private final Map<SnippeterType, Snippeter> snippeters; public enum SnippeterType { HTML_AWARE, LINE_AWARE } public SnippetSearcher(DocumentSearcher searcher, DocumentStorage storage, IndexEngineParser parser){ Preconditions.checkNotNull(searcher); Preconditions.checkNotNull(storage); this.delegate = searcher; this.storage = storage; this.parser = parser; this.snippeters = ImmutableMap.of( SnippeterType.HTML_AWARE, new HtmlAwareSnippeter(), SnippeterType.LINE_AWARE, new LineAwareSnippeter()); } /** * @see AbstractDocumentSearcher#search(Query query, int start, int limit, int scoringFunctionIndex, Map<String, String> extraParameters). * * @param extraParameters: It will process 'fetch_fields', 'snippet_fields' and 'snippet_type'. * 'fetch_fields' and 'snippet_fields' are comma-separated lists of field names to fetch an snippet. * 'snippet_type' can be either 'html' or 'lines'. 'html' is the default. * */ @Override public SearchResults search(Query query, int start, int limit, int scoringFunctionIndex, Map<String, String> extraParameters) throws InterruptedException { // call delegate searcher SearchResults results = this.delegate.search(query, start, limit, scoringFunctionIndex, extraParameters); long startTime = System.currentTimeMillis(); String[] fetchFields = parseFields(extraParameters, "fetch"); String[] snippetFields = parseFields(extraParameters, "snippet"); Set<TermQuery> positiveTerms = query.getRoot().getPositiveTerms(); // find out which snippeter type is the right one for this query String snType = extraParameters.get("snippet_type"); Snippeter sn = null; if (null == snType || "html".equalsIgnoreCase(snType)) { sn = this.snippeters.get(SnippeterType.HTML_AWARE); } else if ("lines".equalsIgnoreCase(snType)) { sn = this.snippeters.get(SnippeterType.LINE_AWARE); } else { throw new IllegalArgumentException("snippet_type has to be either 'html' or 'lines'"); } if (fetchFields.length + snippetFields.length > 0) { for (SearchResult result : results.getResults()) { Document data = storage.getDocument(result.getDocId()); // fetch fields for (String field : fetchFields) { // handle '*', as a fetch all if ("*".equals(field.trim())){ // assume we get the actual fields, not a copy. result.getFields().putAll(data.asMap()); break; } String text = data.getField(field); if (null != text) { result.setField(field, text); } } // snippet fields for (String field : snippetFields) { String text = data.getField(field); if (null != text) { result.setField("snippet_" + field, sn.snippet(positiveTerms, field, text)); } } } } long endTime = System.currentTimeMillis(); logger.debug("(search) fetching & snippeting took: " + (endTime - startTime) + " ms."); return results; } @Override public int countMatches(Query query) throws InterruptedException { return this.delegate.countMatches(query); } private static String[] parseFields(Map<String, String> extraParameters, String key) { if (extraParameters.containsKey(key + "_fields")) { return extraParameters.get(key + "_fields").split(","); } else { return new String[0]; } } private abstract class Snippeter { protected abstract int adjustStart(int position, String text); protected abstract int adjustEnd(int position, String text); private String snippet(Set<TermQuery> terms, String fieldName, String text) { Set<String> termsForField = getTermsForField(terms, fieldName); long t1 = System.currentTimeMillis(); List<AToken> tokens = Lists.newArrayList(parser.parseDocumentField(fieldName, text)); long t2 = System.currentTimeMillis(); logger.debug(String.format("Parsing field %s took %d ms.", fieldName, t2 - t1)); List<Integer> matches = Lists.newArrayList(); for (int i = 0; i < tokens.size(); i++) { String termInText = tokens.get(i).getText(); for (String termInQuery : termsForField) { if ((termInQuery.endsWith("*") && termInText.startsWith(termInQuery.substring(0, termInQuery.length() - 1))) || termInQuery.equals(termInText)) { matches.add(i); } } } if (matches.size() == 0) { return ""; } Window window = findBestWindow(tokens, matches, 200); long t3 = System.currentTimeMillis(); logger.debug(String.format("Finding best window for %d matches took %d ms.", matches.size(), t3 - t2)); String markedText = mark(window, text); long t4 = System.currentTimeMillis(); logger.debug(String.format("Marking text %d chars in %d ms.", markedText.length(), t4 - t3)); return markedText; } private String mark(Window window, String text) { Preconditions.checkArgument(!window.matches.isEmpty(), "Cannot mark an empty window"); StringBuilder buff = new StringBuilder(500); String open = "<b>"; String close = "</b>"; int current = window.start; // let subclasses handle where snippets start current = adjustStart(current, text); for (AToken token : window.matches) { escapeAndAppend(buff, text, current, token.getStartOffset()); buff.append(open); int start = token.getStartOffset(); int endOffset = token.getEndOffset(); escapeAndAppend(buff, text, start, endOffset); buff.append(close); current = endOffset; } // let subclasses handle where snippets end int finish = window.end; finish = adjustEnd(finish, text); escapeAndAppend(buff, text, current, finish); return buff.toString(); } private Window findBestWindow(List<AToken> tokens, List<Integer> matches, int maxSize) { if (matches.size() == 0) { return null; } List<AToken> mtokens = asTokens(matches, tokens); List<Integer> best = null; float bestScore = 0f; int left = 0; int right = 0; while (right < matches.size()) { right++; while (mtokens.get(right - 1).getEndOffset() - mtokens.get(left).getStartOffset() > maxSize) { left++; } List<AToken> candidate = mtokens.subList(left, right); float score = scoreWindow(candidate); if (score > bestScore) { bestScore = score; best = matches.subList(left, right); } } return getWindowContext(tokens, best); } private Window getWindowContext(List<AToken> tokens, List<Integer> best) { int left = best.get(0); int right = best.get(best.size()-1); Window window = new Window(); window.matches = asTokens(best, tokens); window.start = tokens.get(Math.max(0, left - 5)).getStartOffset(); window.end = tokens.get(Math.min(right + 24, tokens.size()-1)).getEndOffset(); return window; } private float scoreWindow(List<AToken> candidate) { Set<String> terms = Sets.newHashSet(); for (AToken token : candidate) { terms.add(token.getText()); } return candidate.size() * terms.size() * terms.size(); } } /** * A Snippeter that tries not to cut HTML entities */ private class HtmlAwareSnippeter extends Snippeter { // Snippeter abstract methods protected int adjustStart(int position, String text) { // tokenizers may cut off & on entities .. fix that if (position > 0 && text.charAt(position -1) == '&') { return position -1; } return position; } protected int adjustEnd(int position, String text){ // tokenizers miss final ; on entities. Try to fix that return position; } } /** * A Snippeter that returns complete lines. */ private class LineAwareSnippeter extends Snippeter { // Snippeter abstract methods protected int adjustEnd(int finish, String text) { while (finish < text.length() && text.charAt(finish) != '\n') { finish++; } if (finish < text.length()) { // loop above ended because of text.charAt .. // return the endline finish++; } return finish; } protected int adjustStart(int current, String text) { while (current > 0 && text.charAt(current-1) != '\n') { current--; } return current; } } private List<AToken> asTokens(List<Integer> matches, final List<AToken> tokens) { return Lists.transform(matches, new Function<Integer, AToken>() { @Override public AToken apply(Integer pair) { return tokens.get(pair); } }); } private static class Window { int start; int end; List<AToken> matches = Lists.newArrayList(); } private Set<String> getTermsForField(Set<TermQuery> terms, String fieldName) { Set<String> retval = new HashSet<String>(); for (TermQuery t : terms) { if (t.getField().equals(fieldName)) { retval.add(t.getTerm()); } } return retval; } private void escapeAndAppend(StringBuilder dest, String str, int start, int offset) { if (dest == null ) { throw new IllegalArgumentException ("The Writer must not be null."); } if (str == null) { return; } CharacterTranslator.HTML4.escape(dest, str, start, offset); } }