SynonymIndexSearcher.java example

Explorer
data-quality-master
// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.standardization.index;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

/**
 * A class to search an index with synonyms.
 *
 * @author scorreia
 */
public class SynonymIndexSearcher {

    // log4j logger of this class; the methods below use it to report I/O failures that they do not rethrow
    private static final Logger LOGGER = Logger.getLogger(SynonymIndexSearcher.class);

    /**
     * The strategies that can be selected to match an input string against the synonym index. Each constant carries
     * a textual label that is used to look the constant up with {@link #get(String)}.
     */
    public enum SynonymSearchMode {

        MATCH_ANY("MATCH_ANY"),
        MATCH_PARTIAL("MATCH_PARTIAL"),
        MATCH_ALL("MATCH_ALL"),
        MATCH_EXACT("MATCH_EXACT"),
        MATCH_ANY_FUZZY("MATCH_ANY_FUZZY"),
        MATCH_ALL_FUZZY("MATCH_ALL_FUZZY"),

        /**
         * Used only for searching semantic dictionary.
         *
         * @deprecated moved to DictionarySearcher
         */
        @Deprecated
        MATCH_SEMANTIC_DICTIONARY("MATCH_SEMANTIC_DICTIONARY"),
        /**
         * Used only for searching semantic keyword.
         *
         * @deprecated moved to DictionarySearcher
         */
        @Deprecated
        MATCH_SEMANTIC_KEYWORD("MATCH_SEMANTIC_KEYWORD");

        /** The label of this mode; set once at construction time. */
        private final String label;

        SynonymSearchMode(String label) {
            this.label = label;
        }

        private String getLabel() {
            return label;
        }

        /**
         * Method "get" looks a search mode up by its label, ignoring case.
         *
         * @param label the label of the match mode (may be null)
         * @return the match mode with the given label, or {@link #MATCH_ANY} when the label is null or matches no
         * mode; never null
         */
        public static SynonymSearchMode get(String label) {
            for (SynonymSearchMode type : SynonymSearchMode.values()) {
                // equalsIgnoreCase(null) is false, so a null label simply falls through to the default
                if (type.getLabel().equalsIgnoreCase(label)) {
                    return type;
                }
            }
            return MATCH_ANY; // default value
        }
    }

    /** Name of the field queried with the individual tokens of the input (reference word side). */
    public static final String F_WORD = "word";//$NON-NLS-1$

    /** Name of the field queried with the individual tokens of the input (synonym side). */
    public static final String F_SYN = "syn";//$NON-NLS-1$

    /** Name of the field queried with the whole lower-cased input as one single term (reference word side). */
    public static final String F_WORDTERM = "wordterm";//$NON-NLS-1$

    /** Name of the field queried with the whole input as one single term (synonym side). */
    public static final String F_SYNTERM = "synterm";//$NON-NLS-1$

    // source of the index searchers: every search acquires a searcher from it and releases it afterwards
    private SearcherManager mgr;

    // maximum number of hits requested from the index by each search
    private int topDocLimit = 3;

    // number of edits handed to the FuzzyQuery instances built when a fuzzy search is requested
    private int maxEdits = 1; // Default value

    // boost set on the query that looks for the whole input in F_WORDTERM
    private static final float WORD_TERM_BOOST = 2F;

    // boost set on the query that looks for the tokens of the input in F_WORD
    private static final float WORD_BOOST = 1.5F;

    // token count above which the semantic modes stop considering the input (dictionary) or its tail (keyword)
    private static final int MAX_TOKEN_COUNT_FOR_SEMANTIC_MATCH = 20;

    // created lazily by getAnalyzer(); the query building methods of this class do not read it
    private Analyzer analyzer;

    // selects the kind of query built by searchDocumentBySynonym
    private SynonymSearchMode searchMode = SynonymSearchMode.MATCH_ANY;

    // only stored and returned by its accessors; no search method of this class applies it
    private float matchingThreshold = 0f;

    /**
     * The slop is only used for
     * {@link org.talend.dataquality.standardization.index.SynonymIndexSearcher.SynonymSearchMode#MATCH_PARTIAL}.
     * <p>
     * By default, the slop factor is one, meaning only one gap between the searched tokens is allowed.
     * <p>
     * For example: "the brown" can match "the quick brown fox", but "the fox" will not match it unless the slop value
     * is set to 2 or greater.
     */
    private int slop = 1;

    /**
     * Instantiates an index searcher without opening any index. A call to an index initialization method such as
     * {@link #openIndexInFS(String)} is required before using any other method.
     * 
     * @deprecated avoid using this constructor
     */
    @Deprecated
    public SynonymIndexSearcher() {
    }

    /**
     * SynonymIndexSearcher constructor creates this searcher and tries to open the index found at the given path.
     * <p>
     * An {@link IOException} raised while opening the index is logged and not rethrown; in that case no searcher
     * manager is set on the new instance.
     *
     * @param indexPath the path to the index.
     */
    public SynonymIndexSearcher(String indexPath) {
        try {
            openIndexInFS(indexPath);
        } catch (IOException e) {
            // deliberately not propagated: the failure is only reported in the log
            LOGGER.error("Unable to open synonym index.", e);
        }
    }

    /**
     * Package-private constructor which creates this searcher on top of the given Lucene directory.
     *
     * @param indexDir the directory that holds the index
     * @throws IOException if the searcher manager cannot be created on the directory
     */
    SynonymIndexSearcher(Directory indexDir) throws IOException {
        mgr = new SearcherManager(indexDir, null);
    }

    /**
     * Method "openIndexInFS" opens a FS folder index and makes it the index searched by this instance.
     * <p>
     * When the searcher manager cannot be created (for instance because the folder holds no index), the directory
     * opened on the folder is closed again before the exception is propagated, and the manager previously set on
     * this instance (if any) is left untouched.
     *
     * @param path the path of the index folder
     * @throws java.io.IOException if file does not exist, or any other problem
     */
    public void openIndexInFS(String path) throws IOException {
        FSDirectory indexDir = FSDirectory.open(new File(path));
        boolean opened = false;
        try {
            mgr = new SearcherManager(indexDir, null);
            opened = true;
        } finally {
            if (!opened) {
                // nobody else holds a reference to the directory, so it would stay open forever
                indexDir.close();
            }
        }
    }

    /**
     * Searches the documents whose reference word is exactly the given word (compared in lower case, after trimming).
     * <p>
     * An {@link IOException} raised while accessing the index is logged and not rethrown.
     *
     * @param word the word to look for
     * @return the documents found, or null when the word is null or blank or when the index could not be searched
     */
    public TopDocs searchDocumentByWord(String word) {
        if (word == null) {
            return null;
        }
        String tempWord = word.trim();
        if ("".equals(tempWord)) { //$NON-NLS-1$
            return null;
        }
        TopDocs docs = null;
        try {
            final IndexSearcher searcher = mgr.acquire();
            try {
                docs = searcher.search(createWordQueryFor(tempWord), topDocLimit);
            } finally {
                // always hand the searcher back, otherwise its reader can never be freed by the manager
                mgr.release(searcher);
            }
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
        return docs;
    }

    /**
     * Searches for documents by one of the synonyms (which may be the word), with the query that corresponds to the
     * current search mode.
     *
     * @param stringToSearch the string to look for; must not be null
     * @return the documents found by the query of the current search mode
     * @throws java.io.IOException if the input cannot be tokenized or the index cannot be accessed
     */
    public TopDocs searchDocumentBySynonym(String stringToSearch) throws IOException {
        final Query query;
        switch (searchMode) {
        case MATCH_PARTIAL:
            query = createCombinedQueryForPartialMatch(stringToSearch);
            break;
        case MATCH_ALL:
            query = createCombinedQueryFor(stringToSearch, false, true);
            break;
        case MATCH_EXACT:
            query = createCombinedQueryForExactMatch(stringToSearch);
            break;
        case MATCH_ANY_FUZZY:
            query = createCombinedQueryFor(stringToSearch, true, false);
            break;
        case MATCH_ALL_FUZZY:
            query = createCombinedQueryFor(stringToSearch, true, true);
            break;
        case MATCH_SEMANTIC_DICTIONARY:
            query = createQueryForSemanticDictionaryMatch(stringToSearch);
            break;
        case MATCH_SEMANTIC_KEYWORD:
            query = createQueryForSemanticKeywordMatch(stringToSearch);
            break;
        case MATCH_ANY:
        default: // any other mode is handled as MATCH_ANY
            query = createCombinedQueryFor(stringToSearch, false, false);
            break;
        }
        final IndexSearcher searcher = mgr.acquire();
        try {
            return searcher.search(query, topDocLimit);
        } finally {
            // hand the searcher back even when the search fails, otherwise its reader can never be freed
            mgr.release(searcher);
        }
    }

    /**
     * Count the synonyms of the first document found by a query on word.
     * <p>
     * An {@link IOException} raised while accessing the index is logged and not rethrown.
     *
     * @param word the reference word to look for
     * @return the number of synonyms, or -1 when the word is null, when no document is found or when the index could
     * not be read
     */
    public int getSynonymCount(String word) {
        if (word == null) {
            return -1;
        }
        int synonymCount = -1;
        try {
            final IndexSearcher searcher = mgr.acquire();
            try {
                TopDocs docs = searcher.search(createWordQueryFor(word), topDocLimit);
                if (docs.totalHits > 0) {
                    Document doc = searcher.doc(docs.scoreDocs[0].doc);
                    synonymCount = doc.getValues(F_SYN).length;
                }
            } finally {
                // released on every path; returning before the release would leak the searcher on each hit
                mgr.release(searcher);
            }
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
        return synonymCount;
    }

    /**
     * Get a document from search result by its document number.
     * <p>
     * An {@link IOException} raised while accessing the index is logged and not rethrown.
     *
     * @param docNum the doc number
     * @return the document, or null when the document is marked as deleted or when it could not be read
     */
    public Document getDocument(int docNum) {
        Document doc = null;
        try {
            final IndexSearcher searcher = mgr.acquire();
            try {
                IndexReader reader = searcher.getIndexReader();
                Bits liveDocs = MultiFields.getLiveDocs(reader);
                // a null liveDocs means that the index has no deletion at all
                if (liveDocs == null || liveDocs.get(docNum)) {
                    doc = reader.document(docNum);
                }
            } finally {
                // released on every path, including the "deleted document" one and a failed read
                mgr.release(searcher);
            }
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
        return doc;
    }

    /**
     * Method "getWordByDocNumber".
     *
     * @param docNo the document number
     * @return the first value of the word field of the document, or null when no document is found or when the
     * document has no word field
     */
    public String getWordByDocNumber(int docNo) {
        Document document = getDocument(docNo);
        // Document.get gives the first value of the field, or null when the field is missing, whereas indexing the
        // array returned by getValues fails on a document without any word
        return document != null ? document.get(F_WORD) : null;
    }

    /**
     * Method "getSynonymsByDocNumber" reads the values of the synonym field of a document.
     *
     * @param docNo the doc number
     * @return the synonyms or null if no document is found
     */
    public String[] getSynonymsByDocNumber(int docNo) {
        final Document found = getDocument(docNo);
        if (found == null) {
            return null;
        }
        return found.getValues(F_SYN);
    }

    /**
     * Method "getNumDocs".
     * <p>
     * An {@link IOException} raised while accessing the index is logged and not rethrown.
     *
     * @return the number of documents in the index, or -1 when the index could not be accessed
     */
    public int getNumDocs() {
        try {
            final IndexSearcher searcher = mgr.acquire();
            try {
                return searcher.getIndexReader().numDocs();
            } finally {
                // hand the searcher back whatever happens while reading the count
                mgr.release(searcher);
            }
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
        return -1;
    }

    /**
     * Method "getMaxDoc".
     * <p>
     * An {@link IOException} raised while accessing the index is logged and not rethrown.
     *
     * @return the max document number of the index, or -1 when the index could not be accessed
     */
    public int getMaxDoc() {
        try {
            final IndexSearcher searcher = mgr.acquire();
            try {
                return searcher.getIndexReader().maxDoc();
            } finally {
                // hand the searcher back whatever happens while reading the value
                mgr.release(searcher);
            }
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
        return -1;
    }

    /**
     * Getter for topDocLimit, the maximum number of documents requested from the index by a search.
     *
     * @return the topDocLimit
     */
    public int getTopDocLimit() {
        return this.topDocLimit;
    }

    /**
     * Method "setTopDocLimit" set the maximum number of documents to return after a search. The value is stored as is,
     * without any validation.
     *
     * @param topDocLimit the limit
     */
    public void setTopDocLimit(int topDocLimit) {
        this.topDocLimit = topDocLimit;
    }

    /**
     * Getter for slop. The slop is the maximum number of moves allowed to put the terms in order. It is set on the
     * phrase queries built for the partial match mode.
     *
     * @return the slop
     */
    public int getSlop() {
        return this.slop;
    }

    /**
     * Sets the slop set on the phrase queries built for the partial match mode. The value is stored as is, without
     * any validation.
     *
     * @param slop the slop to set
     */
    public void setSlop(int slop) {
        this.slop = slop;
    }

    /**
     * Method "setAnalyzer" replaces the analyzer returned by {@link #getAnalyzer()}.
     * <p>
     * NOTE(review): the query building methods of this class tokenize with {@link #getTokensFromAnalyzer(String)},
     * which does not read this analyzer; confirm whether callers outside this class rely on it.
     *
     * @param analyzer the analyzer to use in searches.
     */
    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * Getter for the analyzer. When no analyzer has been set yet, a StandardAnalyzer with an empty stop word set is
     * created, kept, and returned.
     *
     * @return the analyzer used in searches (StandardAnalyzer by default)
     */
    public Analyzer getAnalyzer() {
        if (this.analyzer != null) {
            return this.analyzer;
        }
        this.analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
        return this.analyzer;
    }

    /**
     * Creates the query that looks for the given string, lower-cased, as one single term of the word term field.
     *
     * @param stringToSearch the string to look for; must not be null
     * @return the term query on {@link #F_WORDTERM}
     */
    private Query createWordQueryFor(String stringToSearch) {
        final Term wordTerm = new Term(F_WORDTERM, stringToSearch.toLowerCase());
        return new TermQuery(wordTerm);
    }

    /**
     * Creates a query on one single term.
     *
     * @param field the name of the field to search
     * @param text the text of the term
     * @param fuzzy true to get a FuzzyQuery built with the current maxEdits, false to get a plain TermQuery
     * @return the query on the term
     */
    private Query getTermQuery(String field, String text, boolean fuzzy) {
        final Term searchedTerm = new Term(field, text);
        if (fuzzy) {
            return new FuzzyQuery(searchedTerm, maxEdits);
        }
        return new TermQuery(searchedTerm);
    }

    /**
     * Creates a combined query which searches for the entire lower-cased input string as one term (in the word term
     * and synonym term fields) and also for each token of the input separately (in the word and synonym fields). The
     * four sub-queries are optional clauses of the combined query; the two queries on the reference word are boosted.
     *
     * @param input the string to look for
     * @param fuzzy this option decides whether FuzzyQuery is used instead of TermQuery for every term
     * @param allMatch this option makes every token a mandatory clause of the token queries instead of an optional one
     * @return the combined query
     * @throws java.io.IOException if the input cannot be tokenized
     */
    private Query createCombinedQueryFor(String input, boolean fuzzy, boolean allMatch) throws IOException {
        // queries on the whole input
        final Query wholeWordQuery = getTermQuery(F_WORDTERM, input.toLowerCase(), fuzzy);
        final Query wholeSynQuery = getTermQuery(F_SYNTERM, input.toLowerCase(), fuzzy);

        // queries on the tokens of the input
        final BooleanClause.Occur tokenOccur = allMatch ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
        final BooleanQuery tokenWordQuery = new BooleanQuery();
        final BooleanQuery tokenSynQuery = new BooleanQuery();
        for (String token : getTokensFromAnalyzer(input)) {
            tokenWordQuery.add(getTermQuery(F_WORD, token, fuzzy), tokenOccur);
            tokenSynQuery.add(getTermQuery(F_SYN, token, fuzzy), tokenOccur);
        }

        // increase importance of the reference word
        wholeWordQuery.setBoost(WORD_TERM_BOOST);
        tokenWordQuery.setBoost(WORD_BOOST);

        final BooleanQuery combined = new BooleanQuery();
        combined.add(wholeWordQuery, BooleanClause.Occur.SHOULD);
        combined.add(wholeSynQuery, BooleanClause.Occur.SHOULD);
        combined.add(tokenWordQuery, BooleanClause.Occur.SHOULD);
        combined.add(tokenSynQuery, BooleanClause.Occur.SHOULD);
        return combined;
    }

    /**
     * Creates a combined query which searches for the tokens of the input as a phrase (with the current slop) in the
     * word and synonym fields, and also for the entire lower-cased input string as one term (with TermQuery). The
     * four sub-queries are optional clauses of the combined query; the two queries on the reference word are boosted.
     *
     * @param input the string to look for
     * @return the combined query
     * @throws java.io.IOException if the input cannot be tokenized
     */
    private Query createCombinedQueryForPartialMatch(String input) throws IOException {
        // queries on the whole input
        final Query wholeWordQuery = getTermQuery(F_WORDTERM, input.toLowerCase(), false);
        final Query wholeSynQuery = getTermQuery(F_SYNTERM, input.toLowerCase(), false);

        // phrase queries on the tokens of the input
        final List<String> tokens = getTokensFromAnalyzer(input);
        final PhraseQuery wordPhrase = new PhraseQuery();
        final PhraseQuery synPhrase = new PhraseQuery();
        wordPhrase.setSlop(slop);
        synPhrase.setSlop(slop);
        for (String token : tokens) {
            final String loweredToken = token.toLowerCase();
            wordPhrase.add(new Term(F_WORD, loweredToken));
            synPhrase.add(new Term(F_SYN, loweredToken));
        }

        // increase importance of the reference word
        wholeWordQuery.setBoost(WORD_TERM_BOOST);
        wordPhrase.setBoost(WORD_BOOST);

        final BooleanQuery combined = new BooleanQuery();
        combined.add(wholeWordQuery, BooleanClause.Occur.SHOULD);
        combined.add(wholeSynQuery, BooleanClause.Occur.SHOULD);
        combined.add(wordPhrase, BooleanClause.Occur.SHOULD);
        combined.add(synPhrase, BooleanClause.Occur.SHOULD);
        return combined;
    }

    /**
     * Creates the query of the semantic dictionary mode: the tokens of the input, joined with single spaces, are
     * searched as one term of the synonym term field. An input with more tokens than the allowed maximum is replaced
     * by a query on the empty term.
     *
     * @param input the string to look for
     * @return the term query on the synonym term field
     * @throws IOException if the input cannot be tokenized
     * @deprecated moved to DictionarySearcher
     */
    @Deprecated
    private Query createQueryForSemanticDictionaryMatch(String input) throws IOException {
        final List<String> inputTokens = getTokensFromAnalyzer(input);
        if (inputTokens.size() <= MAX_TOKEN_COUNT_FOR_SEMANTIC_MATCH) {
            final String joinedTokens = StringUtils.join(inputTokens, ' ');
            return getTermQuery(F_SYNTERM, joinedTokens, false);
        }
        // for dictionary search, ignore searching for input containing too many tokens
        return new TermQuery(new Term(F_SYNTERM, StringUtils.EMPTY));
    }

    /**
     * Creates the query of the semantic keyword mode: each token of the input becomes an optional term clause on the
     * synonym field. Only the first tokens of the input are used, up to the allowed maximum.
     *
     * @param input the string to look for
     * @return the boolean query on the synonym field
     * @throws IOException if the input cannot be tokenized
     * @deprecated moved to DictionarySearcher
     */
    @Deprecated
    private Query createQueryForSemanticKeywordMatch(String input) throws IOException {
        final BooleanQuery keywordQuery = new BooleanQuery();
        final List<String> inputTokens = getTokensFromAnalyzer(input);
        // for keyword search, only search the beginning tokens from input
        final int searchedTokenCount = Math.min(inputTokens.size(), MAX_TOKEN_COUNT_FOR_SEMANTIC_MATCH);
        for (int i = 0; i < searchedTokenCount; i++) {
            keywordQuery.add(getTermQuery(F_SYN, inputTokens.get(i), false), BooleanClause.Occur.SHOULD);
        }
        return keywordQuery;
    }

    /**
     * Creates a combined query which searches for the entire lower-cased input string as one term (with TermQuery),
     * either in the word term field or in the synonym term field. The input is not tokenized; the query on the
     * reference word is boosted.
     *
     * @param input the string to look for
     * @return the combined query
     * @throws java.io.IOException declared for symmetry with the other query builders
     */
    private Query createCombinedQueryForExactMatch(String input) throws IOException {
        final Query exactWordQuery = getTermQuery(F_WORDTERM, input.toLowerCase(), false);
        final Query exactSynQuery = getTermQuery(F_SYNTERM, input.toLowerCase(), false);

        // increase importance of the reference word
        exactWordQuery.setBoost(WORD_TERM_BOOST);

        final BooleanQuery combined = new BooleanQuery();
        combined.add(exactWordQuery, BooleanClause.Occur.SHOULD);
        combined.add(exactSynQuery, BooleanClause.Occur.SHOULD);
        return combined;
    }

    /**
     * Closes the searcher manager of this instance, which drops the manager's own reference on the index reader.
     * Does nothing when no index has been opened.
     * <p>
     * An {@link IOException} raised while closing is logged and not rethrown. Once closed, this instance cannot
     * search any more until an index is opened again.
     */
    public void close() {
        if (mgr == null) {
            return;
        }
        try {
            // closing the manager is what gives up its reference; acquiring a searcher and closing its reader only
            // cancels the reference taken by the acquire itself and leaves the reader open
            mgr.close();
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    /**
     * Getter for searchMode.
     *
     * @return the mode which selects the query built by {@link #searchDocumentBySynonym(String)}
     */
    public SynonymSearchMode getSearchMode() {
        return searchMode;
    }

    /**
     * Sets the mode which selects the query built by {@link #searchDocumentBySynonym(String)}. The value is stored as
     * is, without any null check.
     *
     * @param searchMode the search mode to set
     */
    public void setSearchMode(SynonymSearchMode searchMode) {
        this.searchMode = searchMode;
    }

    /**
     * Sets the number of edits handed to the fuzzy queries built by the fuzzy search modes. The value is stored as
     * is, without any validation.
     * <p>
     * NOTE(review): FuzzyQuery presumably accepts a limited range of values only; confirm against the Lucene version
     * in use.
     *
     * @param maxEdits the maximum number of edits to set
     */
    public void setMaxEdits(int maxEdits) {
        this.maxEdits = maxEdits;
    }

    /**
     * Getter for matchingThreshold. This class only stores the value: none of its search methods applies it.
     *
     * @return the matching threshold
     */
    public float getMatchingThreshold() {
        return matchingThreshold;
    }

    /**
     * Sets the matching threshold returned by {@link #getMatchingThreshold()}.
     *
     * @param matchingThreshold the matching threshold to set
     */
    public void setMatchingThreshold(float matchingThreshold) {
        this.matchingThreshold = matchingThreshold;
    }

    /**
     * Sets the matching threshold returned by {@link #getMatchingThreshold()} from a double value.
     *
     * @param matchingThreshold the matching threshold to set; it is narrowed to a float
     */
    public void setMatchingThreshold(double matchingThreshold) {
        this.matchingThreshold = (float) matchingThreshold;
    }

    /**
     * Splits the input into tokens with a fixed chain made of a standard tokenizer, a standard filter, a lower case
     * filter and an ASCII folding filter. Despite its name, this method does not use the analyzer set on a searcher
     * instance.
     * 
     * @param input the string to tokenize (may be null)
     * @return the list of lower-case tokens of the input, with accents and punctuation stripped; an empty list when
     * the input is null
     * @throws IOException if the input cannot be read by the tokenizer
     */
    public static List<String> getTokensFromAnalyzer(String input) throws IOException {
        List<String> termList = new ArrayList<String>();
        if (input == null) {
            return termList;
        }
        TokenStream result = new StandardTokenizer(new StringReader(input));
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new ASCIIFoldingFilter(result);
        try {
            CharTermAttribute charTermAttribute = result.addAttribute(CharTermAttribute.class);
            // reset the outermost stream: the filters pass the call down to the tokenizer
            result.reset();
            while (result.incrementToken()) {
                termList.add(charTermAttribute.toString());
            }
            // signal the end of the stream as required by the token stream consumer workflow
            result.end();
        } finally {
            // release the resources of the stream even when the tokenization fails
            result.close();
        }
        return termList;
    }
}