package org.talend.dataquality.semantic.index;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;

/**
 * Base class for dictionary searchers. It holds the shared index field names, the search settings
 * (document limit, fuzzy edit distance, search mode) and the helpers that turn an input string into
 * tokens and Lucene queries on the {@link #F_SYNTERM} field.
 *
 * Created by jteuladedenantes on 16/11/16.
 */
public abstract class AbstractDictionarySearcher {

    private static final Logger LOGGER = Logger.getLogger(AbstractDictionarySearcher.class);

    // Index field names. Only F_SYNTERM is queried by this class; the others are exposed as constants.
    public static final String F_ID = "docid";//$NON-NLS-1$

    public static final String F_WORD = "word";//$NON-NLS-1$

    public static final String F_SYNTERM = "synterm";//$NON-NLS-1$

    public static final String F_RAW = "raw";

    public static final String F_CATID = "catid";

    protected int topDocLimit = 3;

    private int maxEdits = 2; // Default value

    /** Keyword queries use at most this many leading tokens of the input. */
    private static final int MAX_TOKEN_COUNT_FOR_KEYWORD_MATCH = 20;

    /** Dictionary queries are not attempted for inputs longer than this many characters. */
    private static final int MAX_CHAR_COUNT_FOR_DICTIONARY_MATCH = 100;

    protected DictionarySearchMode searchMode = DictionarySearchMode.MATCH_SEMANTIC_DICTIONARY;

    public abstract TopDocs searchDocumentBySynonym(String stringToSearch) throws IOException;

    public abstract Document getDocument(int docNum);

    /**
     * Method "setTopDocLimit" set the maximum number of documents to return after a search.
     *
     * @param topDocLimit the limit
     */
    public void setTopDocLimit(int topDocLimit) {
        this.topDocLimit = topDocLimit;
    }

    /**
     * Sets the edit distance passed to {@link FuzzyQuery} when a fuzzy term query is built.
     *
     * @param maxEdits the maximum number of edits
     */
    public void setMaxEdits(int maxEdits) {
        this.maxEdits = maxEdits;
    }

    public DictionarySearchMode getSearchMode() {
        return searchMode;
    }

    public void setSearchMode(DictionarySearchMode searchMode) {
        this.searchMode = searchMode;
    }

    /**
     * Builds a single-term query.
     *
     * @param field the index field to search
     * @param text the term text
     * @param fuzzy when true a {@link FuzzyQuery} limited to {@code maxEdits} edits is built, otherwise an exact
     * {@link TermQuery}
     * @return the query
     */
    private Query getTermQuery(String field, String text, boolean fuzzy) {
        Term term = new Term(field, text);
        return fuzzy ? new FuzzyQuery(term, maxEdits) : new TermQuery(term);
    }

    /**
     * Builds an exact term query on {@link #F_SYNTERM} whose text is the analyzed tokens of the input joined with
     * single spaces.
     *
     * @param input the string to search for
     * @return the term query; for an input longer than MAX_CHAR_COUNT_FOR_DICTIONARY_MATCH characters, a term query
     * on the empty string
     * @throws IOException declared for callers and subclasses; this implementation does not throw it itself
     */
    protected Query createQueryForSemanticDictionaryMatch(String input) throws IOException {
        // for dictionary search, ignore searching for input containing too many characters
        if (input.length() > MAX_CHAR_COUNT_FOR_DICTIONARY_MATCH) {
            return new TermQuery(new Term(F_SYNTERM, StringUtils.EMPTY));
        }
        return getTermQuery(F_SYNTERM, getJointTokens(input), false);
    }

    /**
     * Builds a boolean query with one optional (SHOULD) exact term clause on {@link #F_SYNTERM} per token of the
     * input.
     *
     * @param input the string to search for
     * @return the boolean query; it has no clause when the input yields no token
     * @throws IOException declared for callers and subclasses; this implementation does not throw it itself
     */
    protected Query createQueryForSemanticKeywordMatch(String input) throws IOException {
        BooleanQuery booleanQuery = new BooleanQuery();
        List<String> tokens = getTokensFromAnalyzer(input);
        // for keyword search, when the token count exceeds MAX_TOKEN_COUNT_FOR_KEYWORD_MATCH, only search the
        // beginning tokens from input
        int clauseCount = Math.min(tokens.size(), MAX_TOKEN_COUNT_FOR_KEYWORD_MATCH);
        for (int i = 0; i < clauseCount; i++) {
            booleanQuery.add(getTermQuery(F_SYNTERM, tokens.get(i), false), BooleanClause.Occur.SHOULD);
        }
        return booleanQuery;
    }

    /**
     * @param input the string to tokenize
     * @return the tokens of {@link #getTokensFromAnalyzer(String)} joined with single spaces
     */
    public static String getJointTokens(String input) {
        return StringUtils.join(getTokensFromAnalyzer(input), ' ');
    }

    /**
     * Tokenizes the input with a standard tokenizer, then lower-cases the tokens and folds them to ASCII.
     * <p>
     * When the input yields exactly one token, the returned list holds instead the whole input, lower-cased and
     * with accents stripped, so that punctuation is kept and an exact match is required.
     * <p>
     * An I/O failure during tokenization is logged and the tokens collected so far are returned.
     *
     * @param input the string to tokenize, must not be null
     * @return a list of lower-case tokens without accents; empty when the input yields no token
     */
    public static List<String> getTokensFromAnalyzer(String input) {
        StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(input));
        TokenStream result = new StandardFilter(tokenizer);
        result = new LowerCaseFilter(result);
        result = new ASCIIFoldingFilter(result);
        CharTermAttribute charTermAttribute = result.addAttribute(CharTermAttribute.class);

        List<String> termList = new ArrayList<String>();
        try {
            // reset the outermost stream: it resets every filter of the chain down to the tokenizer
            result.reset();
            while (result.incrementToken()) {
                termList.add(charTermAttribute.toString());
            }
            result.end();
        } catch (IOException e) {
            LOGGER.error("Failed to tokenize the input, returning the tokens collected so far", e);
        } finally {
            closeQuietly(result);
        }
        if (termList.size() == 1) { // require exact match when the input has only one token
            termList.clear();
            // Locale.ROOT keeps the lower-casing independent from the default locale of the JVM
            termList.add(StringUtils.stripAccents(input.toLowerCase(Locale.ROOT)));
        }
        return termList;
    }

    /** Closes the token stream and logs a failure instead of propagating it. */
    private static void closeQuietly(TokenStream stream) {
        try {
            stream.close();
        } catch (IOException e) {
            LOGGER.warn("Failed to close the token stream", e);
        }
    }
}