/*
 * Copyright (c) 2009-2010 Lockheed Martin Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.eurekastreams.commons.search.explanation;

import java.io.IOException;
import java.io.StringReader;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.eurekastreams.commons.search.modelview.FieldMatch;

/**
 * Determines which fields a search result matched on by parsing the Lucene score Explanation text.
 */
public class FieldMatchDeterminer
{
    /**
     * Instance of the logger.
     */
    private Log log = LogFactory.getLog(FieldMatchDeterminer.class);

    /**
     * The fields to analyze.
     */
    private List<String> fieldsToAnalyze;

    /**
     * The analyzer to use to parse the query string.
     */
    private Analyzer searchAnalyzer;

    /**
     * Set the field names to analyze.
     *
     * @param inFieldsToAnalyze
     *            the field names to analyze.
     */
    public void setFieldsToAnalyze(final List<String> inFieldsToAnalyze)
    {
        fieldsToAnalyze = inFieldsToAnalyze;
    }

    /**
     * Set the search analyzer to use to parse the query.
     *
     * @param inSearchAnalyzer
     *            the search analyzer used to parse the query
     */
    public void setSearchAnalyzer(final Analyzer inSearchAnalyzer)
    {
        searchAnalyzer = inSearchAnalyzer;
    }

    /**
     * Parse the input Explanation text to determine which of the user's search keywords matched which of the
     * analyzed fields.
     *
     * @param inExplanationText
     *            the Explanation text returned from the search
     * @param searchText
     *            the search string the user typed
     * @return a FieldMatch mapping each analyzed field to the list of keywords the user typed that matched it
     */
    public FieldMatch determineFieldMatches(final String inExplanationText, final String searchText)
    {
        String explanationText = inExplanationText;
        FieldMatch matchedKeywords = new FieldMatch();
        if (fieldsToAnalyze.size() == 0)
        {
            return matchedKeywords;
        }

        log.debug("Explanation:" + explanationText);

        // Remove the boost values (e.g. "^2.0") to simplify parsing. Note the dot must be escaped;
        // an unescaped "." would match any character, not just a literal decimal point.
        Pattern boostPattern = Pattern.compile("\\^[0-9]+\\.[0-9]+");
        Matcher boostPatternMatcher = boostPattern.matcher(explanationText);
        explanationText = boostPatternMatcher.replaceAll("");

        // Convert the keywords to their analyzed form, stored as a Map of <tokenizedForm, originalKeyword>.
        Map<String, String> tokenizedKeywords = tokenizeKeywords(searchText);

        // We now have a Map with the tokenized keyword as the key, the original search word as the value.
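        // Illustration (hedged; assumes a stemming analyzer): for the search text "developers testing",
        // the map might contain {"develop" -> "developers", "test" -> "testing"}, letting us translate
        // analyzed terms found in the explanation back to the words the user actually typed.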
        // Start looking through the explanation for the per-field term weight entries.
        for (String fieldName : fieldsToAnalyze)
        {
            Pattern weightPattern = Pattern.compile("\\sweight\\(" + fieldName + ":(\\w+)\\s",
                    Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
            Matcher m = weightPattern.matcher(explanationText);
            boolean result = m.find();
            while (result)
            {
                // Map the analyzed term from the explanation back to the keyword the user typed; guard
                // against terms that don't map back (which would otherwise record a null match).
                String originalKeyword = tokenizedKeywords.get(m.group(1));
                if (originalKeyword != null)
                {
                    matchedKeywords.addMatch(fieldName, originalKeyword);
                }
                result = m.find();
            }
        }
        return matchedKeywords;
    }

    /**
     * Tokenize the input search text using the configured analyzer.
     *
     * @param searchText
     *            the search text to parse
     * @return a Map of tokenized-term -> original term
     */
    private Map<String, String> tokenizeKeywords(final String searchText)
    {
        Map<String, String> tokenizedKeywords = new Hashtable<String, String>();

        // Split on runs of whitespace; "\\s+" avoids empty keywords when the user types consecutive spaces.
        String[] keywords = searchText.split("\\s+");
        TokenStream stream;
        for (String keyword : keywords)
        {
            stream = searchAnalyzer.tokenStream(null, new StringReader(keyword));
            Token token = new Token();
            try
            {
                while ((token = stream.next(token)) != null)
                {
                    String tokenizedKeyword = token.term();
                    log.info("Tokenized keyword: " + tokenizedKeyword);
                    tokenizedKeywords.put(tokenizedKeyword, keyword);
                }
            }
            catch (IOException e)
            {
                // Log and continue - failing to tokenize one keyword shouldn't abort the whole analysis.
                log.error("Error tokenizing the search keyword for explanation: " + keyword, e);
            }
        }
        return tokenizedKeywords;
    }
}
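
/**
 * Usage sketch (not part of the original class): shows how FieldMatchDeterminer might be wired up and
 * invoked. The analyzer choice, field names, and explanation fragment below are illustrative assumptions,
 * not values taken from Eureka Streams configuration.
 */
class FieldMatchDeterminerUsageExample
{
    /**
     * Entry point for the sketch.
     *
     * @param args
     *            unused.
     */
    public static void main(final String[] args)
    {
        FieldMatchDeterminer determiner = new FieldMatchDeterminer();

        // Assumption: StandardAnalyzer stands in for whatever analyzer the search configuration actually uses.
        determiner.setSearchAnalyzer(new org.apache.lucene.analysis.standard.StandardAnalyzer());

        // Assumption: hypothetical field names; use the fields your index actually defines.
        determiner.setFieldsToAnalyze(java.util.Arrays.asList("title", "description"));

        // A fabricated fragment in the shape of a Lucene score Explanation; real text would come from
        // Searcher.explain(query, doc).toString().
        String explanationText = " weight(title:test in 0), product of: ...";

        FieldMatch matches = determiner.determineFieldMatches(explanationText, "testing");
        System.out.println(matches);
    }
}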