// ============================================================================ // // Copyright (C) 2006-2016 Talend Inc. - www.talend.com // // This source code is available under agreement available at // %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt // // You should have received a copy of the agreement // along with this program; if not, write to Talend SA // 9 rue Pages 92150 Suresnes, France // // ============================================================================ package org.talend.dataquality.standardization.index; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.net.URI; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Bits; /** * @author scorreia A class to create an index with synonyms. */ public class SynonymIndexSearcher { private static final Logger LOGGER = Logger.getLogger(SynonymIndexSearcher.class); public enum SynonymSearchMode { MATCH_ANY("MATCH_ANY"), MATCH_PARTIAL("MATCH_PARTIAL"), MATCH_ALL("MATCH_ALL"), MATCH_EXACT("MATCH_EXACT"), MATCH_ANY_FUZZY("MATCH_ANY_FUZZY"), MATCH_ALL_FUZZY("MATCH_ALL_FUZZY"), /** * @deprecated moved to DictionarySearcher */ MATCH_SEMANTIC_DICTIONARY("MATCH_SEMANTIC_DICTIONARY"), // Used only for searching semantic dictionary /** * @deprecated moved to DictionarySearcher */ MATCH_SEMANTIC_KEYWORD("MATCH_SEMANTIC_KEYWORD");// Used only for searching semantic keyword private String label; SynonymSearchMode(String label) { this.label = label; } private String getLabel() { return label; } /** * Method "get". * * @param label the label of the match mode * @return the match mode type given the label or null */ public static SynonymSearchMode get(String label) { for (SynonymSearchMode type : SynonymSearchMode.values()) { if (type.getLabel().equalsIgnoreCase(label)) { return type; } } return MATCH_ANY; // default value } } public static final String F_WORD = "word";//$NON-NLS-1$ public static final String F_SYN = "syn";//$NON-NLS-1$ public static final String F_WORDTERM = "wordterm";//$NON-NLS-1$ public static final String F_SYNTERM = "synterm";//$NON-NLS-1$ private SearcherManager mgr; private int topDocLimit = 3; private int maxEdits = 1; // Default value private static final float WORD_TERM_BOOST = 2F; private static final float WORD_BOOST = 1.5F; private static final int MAX_TOKEN_COUNT_FOR_SEMANTIC_MATCH = 20; private Analyzer analyzer; private SynonymSearchMode searchMode = SynonymSearchMode.MATCH_ANY; private float matchingThreshold = 0f; /** * The slop is only used for * {@link org.talend.dataquality.standardization.index.SynonymIndexSearcher.SynonymSearchMode#MATCH_PARTIAL}. * <p> * By default, the slop factor is one, meaning only one gap between the searched tokens is allowed. * <p> * For example: "the brown" can match "the quick brown fox", but "the fox" will not match it, except that we set the slop * value to 2 or greater. */ private int slop = 1; /** * instantiate an index searcher. A call to the index initialization method such as {@link #openIndexInFS(URI)} is * required before using any other method. * * @deprecated avoid using this constructor */ @Deprecated public SynonymIndexSearcher() { } /** * SynonymIndexSearcher constructor creates this searcher and initializes the index. * * @param indexPath the path to the index. */ public SynonymIndexSearcher(String indexPath) { try { openIndexInFS(indexPath); } catch (IOException e) { LOGGER.error("Unable to open synonym index.", e); } } SynonymIndexSearcher(Directory indexDir) throws IOException { mgr = new SearcherManager(indexDir, null); } /** * Method "openIndexInFS" opens a FS folder index. * * @param path the path of the index folder * @throws java.io.IOException if file does not exist, or any other problem */ public void openIndexInFS(String path) throws IOException { FSDirectory indexDir = FSDirectory.open(new File(path)); mgr = new SearcherManager(indexDir, null); } /** * search a document by the word. * * @param word * @return * @throws java.io.IOException */ public TopDocs searchDocumentByWord(String word) { if (word == null) { return null; } String tempWord = word.trim(); if ("".equals(tempWord)) { //$NON-NLS-1$ return null; } TopDocs docs = null; try { final IndexSearcher searcher = mgr.acquire(); Query query = createWordQueryFor(tempWord); docs = searcher.search(query, topDocLimit); mgr.release(searcher); } catch (IOException e) { LOGGER.error(e.getMessage(), e); } return docs; } /** * search for documents by one of the synonym (which may be the word). * * @param stringToSearch * @return * @throws java.io.IOException */ public TopDocs searchDocumentBySynonym(String stringToSearch) throws IOException { TopDocs topDocs = null; Query query; switch (searchMode) { case MATCH_ANY: query = createCombinedQueryFor(stringToSearch, false, false); break; case MATCH_PARTIAL: query = createCombinedQueryForPartialMatch(stringToSearch); break; case MATCH_ALL: query = createCombinedQueryFor(stringToSearch, false, true); break; case MATCH_EXACT: query = createCombinedQueryForExactMatch(stringToSearch); break; case MATCH_ANY_FUZZY: query = createCombinedQueryFor(stringToSearch, true, false); break; case MATCH_ALL_FUZZY: query = createCombinedQueryFor(stringToSearch, true, true); break; case MATCH_SEMANTIC_DICTIONARY: query = createQueryForSemanticDictionaryMatch(stringToSearch); break; case MATCH_SEMANTIC_KEYWORD: query = createQueryForSemanticKeywordMatch(stringToSearch); break; default: // do the same as MATCH_ANY mode query = createCombinedQueryFor(stringToSearch, false, false); break; } final IndexSearcher searcher = mgr.acquire(); topDocs = searcher.search(query, topDocLimit); mgr.release(searcher); return topDocs; } /** * Count the synonyms of the first document found by a query on word. * * @param word * @return the number of synonyms */ public int getSynonymCount(String word) { try { Query query = createWordQueryFor(word); TopDocs docs; final IndexSearcher searcher = mgr.acquire(); docs = searcher.search(query, topDocLimit); if (docs.totalHits > 0) { Document doc = searcher.doc(docs.scoreDocs[0].doc); String[] synonyms = doc.getValues(F_SYN); return synonyms.length; } mgr.release(searcher); } catch (IOException e) { LOGGER.error(e); } return -1; } /** * Get a document from search result by its document number. * * @param docNum the doc number * @return the document (can be null if any problem) */ public Document getDocument(int docNum) { Document doc = null; try { final IndexSearcher searcher = mgr.acquire(); IndexReader reader = searcher.getIndexReader(); Bits liveDocs = MultiFields.getLiveDocs(reader); if (liveDocs != null && !liveDocs.get(docNum)) { return null; } else { doc = reader.document(docNum); mgr.release(searcher); return doc; } } catch (IOException e) { LOGGER.error(e); } return doc; } /** * Method "getWordByDocNumber". * * @param docNo the document number * @return the document or null */ public String getWordByDocNumber(int docNo) { Document document = getDocument(docNo); return document != null ? document.getValues(F_WORD)[0] : null; } /** * Method "getSynonymsByDocNumber". * * @param docNo the doc number * @return the synonyms or null if no document is found */ public String[] getSynonymsByDocNumber(int docNo) { Document document = getDocument(docNo); return document != null ? document.getValues(F_SYN) : null; } /** * Method "getNumDocs". * * @return the number of documents in the index */ public int getNumDocs() { try { final IndexSearcher searcher = mgr.acquire(); final int numDocs = searcher.getIndexReader().numDocs(); mgr.release(searcher); return numDocs; } catch (IOException e) { LOGGER.error(e.getMessage(), e); } return -1; } /** * Method "getMaxDoc". * * @return the the max document number of the index */ public int getMaxDoc() { try { final IndexSearcher searcher = mgr.acquire(); final int numDocs = searcher.getIndexReader().maxDoc(); mgr.release(searcher); return numDocs; } catch (IOException e) { LOGGER.error(e.getMessage(), e); } return -1; } /** * Getter for topDocLimit. * * @return the topDocLimit */ public int getTopDocLimit() { return this.topDocLimit; } /** * Method "setTopDocLimit" set the maximum number of documents to return after a search. * * @param topDocLimit the limit */ public void setTopDocLimit(int topDocLimit) { this.topDocLimit = topDocLimit; } /** * Getter for slop. The slop is the maximum number of moves allowed to put the terms in order. * * @return the slop */ public int getSlop() { return this.slop; } /** * Sets the slop. * * @param slop the slop to set */ public void setSlop(int slop) { this.slop = slop; } /** * Method "setAnalyzer". * * @param analyzer the analyzer to use in searches. */ public void setAnalyzer(Analyzer analyzer) { this.analyzer = analyzer; } /** * * @return the analyzer used in searches (StandardAnalyzer by default) */ public Analyzer getAnalyzer() { if (analyzer == null) { analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET); } return this.analyzer; } private Query createWordQueryFor(String stringToSearch) { return new TermQuery(new Term(F_WORDTERM, stringToSearch.toLowerCase())); } private Query getTermQuery(String field, String text, boolean fuzzy) { Term term = new Term(field, text); return fuzzy ? new FuzzyQuery(term, maxEdits) : new TermQuery(term); } /** * create a combined query who searches for the input tokens separately (with QueryParser) and also the entire input * string (with TermQuery or FuzzyQuery). * * @param input * @param fuzzy this options decides whether output the fuzzy matches * @param allMatch this options means the result should be returned only if all tokens are found in the index * @return * @throws java.io.IOException */ private Query createCombinedQueryFor(String input, boolean fuzzy, boolean allMatch) throws IOException { BooleanQuery combinedQuery = new BooleanQuery(); Query wordTermQuery, synTermQuery, wordQuery, synQuery; wordTermQuery = getTermQuery(F_WORDTERM, input.toLowerCase(), fuzzy); synTermQuery = getTermQuery(F_SYNTERM, input.toLowerCase(), fuzzy); List<String> tokens = getTokensFromAnalyzer(input); wordQuery = new BooleanQuery(); synQuery = new BooleanQuery(); for (String token : tokens) { ((BooleanQuery) wordQuery).add(getTermQuery(F_WORD, token, fuzzy), allMatch ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD); ((BooleanQuery) synQuery).add(getTermQuery(F_SYN, token, fuzzy), allMatch ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD); } // increase importance of the reference word wordTermQuery.setBoost(WORD_TERM_BOOST); wordQuery.setBoost(WORD_BOOST); combinedQuery.add(wordTermQuery, BooleanClause.Occur.SHOULD); combinedQuery.add(synTermQuery, BooleanClause.Occur.SHOULD); combinedQuery.add(wordQuery, BooleanClause.Occur.SHOULD); combinedQuery.add(synQuery, BooleanClause.Occur.SHOULD); return combinedQuery; } /** * create a combined query who searches for the input tokens in order (with double quotes around the input) and also * the entire input string (with TermQuery). * * @param input * @return * @throws java.io.IOException */ private Query createCombinedQueryForPartialMatch(String input) throws IOException { BooleanQuery combinedQuery = new BooleanQuery(); Query wordTermQuery, synTermQuery, wordQuery, synQuery; wordTermQuery = getTermQuery(F_WORDTERM, input.toLowerCase(), false); synTermQuery = getTermQuery(F_SYNTERM, input.toLowerCase(), false); List<String> tokens = getTokensFromAnalyzer(input); wordQuery = new PhraseQuery(); ((PhraseQuery) wordQuery).setSlop(slop); synQuery = new PhraseQuery(); ((PhraseQuery) synQuery).setSlop(slop); for (String token : tokens) { token = token.toLowerCase(); ((PhraseQuery) wordQuery).add(new Term(F_WORD, token)); ((PhraseQuery) synQuery).add(new Term(F_SYN, token)); } // increase importance of the reference word wordTermQuery.setBoost(WORD_TERM_BOOST); wordQuery.setBoost(WORD_BOOST); combinedQuery.add(wordTermQuery, BooleanClause.Occur.SHOULD); combinedQuery.add(synTermQuery, BooleanClause.Occur.SHOULD); combinedQuery.add(wordQuery, BooleanClause.Occur.SHOULD); combinedQuery.add(synQuery, BooleanClause.Occur.SHOULD); return combinedQuery; } /** * @param input * @return * @throws IOException * @deprecated moved to DictionarySearcher */ @Deprecated private Query createQueryForSemanticDictionaryMatch(String input) throws IOException { List<String> tokens = getTokensFromAnalyzer(input); // for dictionary search, ignore searching for input containing too many tokens if (tokens.size() > MAX_TOKEN_COUNT_FOR_SEMANTIC_MATCH) { return new TermQuery(new Term(F_SYNTERM, StringUtils.EMPTY)); } Query synTermQuery = getTermQuery(F_SYNTERM, StringUtils.join(tokens, ' '), false); return synTermQuery; } /** * @param input * @return * @throws IOException * @deprecated moved to DictionarySearcher */ @Deprecated private Query createQueryForSemanticKeywordMatch(String input) throws IOException { BooleanQuery booleanQuery = new BooleanQuery(); List<String> tokens = getTokensFromAnalyzer(input); // for keyword search, only search the beginning tokens from input if (tokens.size() > MAX_TOKEN_COUNT_FOR_SEMANTIC_MATCH) { for (int i = 0; i < MAX_TOKEN_COUNT_FOR_SEMANTIC_MATCH; i++) { booleanQuery.add(getTermQuery(F_SYN, tokens.get(i), false), BooleanClause.Occur.SHOULD); } } else { for (String token : tokens) { booleanQuery.add(getTermQuery(F_SYN, token, false), BooleanClause.Occur.SHOULD); } } return booleanQuery; } /** * create a combined query who searches for the input tokens in order (with double quotes around the input) and also * the entire input string (with TermQuery). * * @param input * @return * @throws java.io.IOException */ private Query createCombinedQueryForExactMatch(String input) throws IOException { BooleanQuery combinedQuery = new BooleanQuery(); Query wordTermQuery, synTermQuery; wordTermQuery = getTermQuery(F_WORDTERM, input.toLowerCase(), false); synTermQuery = getTermQuery(F_SYNTERM, input.toLowerCase(), false); // increase importance of the reference word wordTermQuery.setBoost(WORD_TERM_BOOST); combinedQuery.add(wordTermQuery, BooleanClause.Occur.SHOULD); combinedQuery.add(synTermQuery, BooleanClause.Occur.SHOULD); return combinedQuery; } public void close() { try { if (mgr != null) { IndexSearcher acquire = mgr.acquire(); if (acquire != null) { IndexReader indexReader = acquire.getIndexReader(); if (indexReader != null) { indexReader.close(); } } } } catch (IOException e) { LOGGER.error(e.getMessage(), e); } } public SynonymSearchMode getSearchMode() { return searchMode; } public void setSearchMode(SynonymSearchMode searchMode) { this.searchMode = searchMode; } public void setMaxEdits(int maxEdits) { this.maxEdits = maxEdits; } public float getMatchingThreshold() { return matchingThreshold; } public void setMatchingThreshold(float matchingThreshold) { this.matchingThreshold = matchingThreshold; } public void setMatchingThreshold(double matchingThreshold) { this.matchingThreshold = (float) matchingThreshold; } /** * * @param input * @return a list of lower-case tokens which strips accents & punctuation * @throws IOException */ public static List<String> getTokensFromAnalyzer(String input) throws IOException { StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input)); TokenStream result = new StandardFilter(tokenStream); result = new LowerCaseFilter(result); result = new ASCIIFoldingFilter(result); CharTermAttribute charTermAttribute = result.addAttribute(CharTermAttribute.class); tokenStream.reset(); List<String> termList = new ArrayList<String>(); while (result.incrementToken()) { String term = charTermAttribute.toString(); termList.add(term); } result.close(); return termList; } }