// AbstractDictionarySearcher.java example (from the data-quality-master project)

package org.talend.dataquality.semantic.index;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;

/**
 * Base class for dictionary searchers. It holds the search settings (result limit, fuzzy edit distance, search
 * mode), builds Lucene queries against the {@link #F_SYNTERM} field and provides the tokenization used to normalize
 * the text being searched.
 * <p>
 * Subclasses supply the actual index access through {@link #searchDocumentBySynonym(String)} and
 * {@link #getDocument(int)}.
 * <p>
 * Created by jteuladedenantes on 16/11/16.
 */
public abstract class AbstractDictionarySearcher {

    private static final Logger LOGGER = Logger.getLogger(AbstractDictionarySearcher.class);

    /** Field name constant: document id. */
    public static final String F_ID = "docid";//$NON-NLS-1$

    /** Field name constant: word. */
    public static final String F_WORD = "word";//$NON-NLS-1$

    /** Field name constant: synonym term. This is the field targeted by every query built in this class. */
    public static final String F_SYNTERM = "synterm";//$NON-NLS-1$

    /** Field name constant: raw value. */
    public static final String F_RAW = "raw";

    /** Field name constant: category id. */
    public static final String F_CATID = "catid";

    /** Maximum number of documents to return after a search; see {@link #setTopDocLimit(int)}. */
    protected int topDocLimit = 3;

    /**
     * Edit distance handed to {@link FuzzyQuery} when a fuzzy term query is requested. Note that no method of this
     * class currently requests a fuzzy query, so the value only matters if such a call is added.
     */
    private int maxEdits = 2; // Default value

    /** Keyword match only considers this many leading tokens of the input. */
    private static final int MAX_TOKEN_COUNT_FOR_KEYWORD_MATCH = 20;

    /** Dictionary match is skipped for inputs longer than this many characters. */
    private static final int MAX_CHAR_COUNT_FOR_DICTIONARY_MATCH = 100;

    protected DictionarySearchMode searchMode = DictionarySearchMode.MATCH_SEMANTIC_DICTIONARY;

    /**
     * Searches the documents matching the given text.
     *
     * @param stringToSearch the text to look up
     * @return the matching documents
     * @throws IOException if the underlying search fails
     */
    public abstract TopDocs searchDocumentBySynonym(String stringToSearch) throws IOException;

    /**
     * Returns the document identified by the given number.
     *
     * @param docNum the document number
     * @return the corresponding document
     */
    public abstract Document getDocument(int docNum);

    /**
     * Method "setTopDocLimit" set the maximum number of documents to return after a search.
     *
     * @param topDocLimit the limit
     */
    public void setTopDocLimit(int topDocLimit) {
        this.topDocLimit = topDocLimit;
    }

    /**
     * Sets the edit distance used when building fuzzy term queries.
     *
     * @param maxEdits the edit distance passed as-is to {@link FuzzyQuery}; it is not validated here
     */
    public void setMaxEdits(int maxEdits) {
        this.maxEdits = maxEdits;
    }

    /**
     * @return the current search mode
     */
    public DictionarySearchMode getSearchMode() {
        return searchMode;
    }

    /**
     * @param searchMode the search mode to use
     */
    public void setSearchMode(DictionarySearchMode searchMode) {
        this.searchMode = searchMode;
    }

    /**
     * Builds a single-term query on the given field.
     *
     * @param field the field to search
     * @param text the term text
     * @param fuzzy {@code true} for a {@link FuzzyQuery} using the configured edit distance, {@code false} for an
     * exact {@link TermQuery}
     * @return the query
     */
    private Query getTermQuery(String field, String text, boolean fuzzy) {
        Term term = new Term(field, text);
        return fuzzy ? new FuzzyQuery(term, maxEdits) : new TermQuery(term);
    }

    /**
     * Builds the query used for dictionary matching: an exact term query on {@link #F_SYNTERM} whose text is the
     * normalized tokens of the input joined by single spaces.
     *
     * @param input the text to search, must not be {@code null}
     * @return the query; for inputs longer than {@value #MAX_CHAR_COUNT_FOR_DICTIONARY_MATCH} characters, a term
     * query on the empty term is returned instead of analyzing the input
     * @throws IOException declared for callers and overriding classes; not thrown by this implementation
     */
    protected Query createQueryForSemanticDictionaryMatch(String input) throws IOException {
        // for dictionary search, skip the analysis of inputs containing too many characters
        if (input.length() > MAX_CHAR_COUNT_FOR_DICTIONARY_MATCH) {
            return new TermQuery(new Term(F_SYNTERM, StringUtils.EMPTY));
        }

        return getTermQuery(F_SYNTERM, getJointTokens(input), false);
    }

    /**
     * Builds the query used for keyword matching: a boolean query with one optional (SHOULD) exact term clause on
     * {@link #F_SYNTERM} per token of the input.
     *
     * @param input the text to search, must not be {@code null}
     * @return the boolean query; it has no clause when the input yields no token
     * @throws IOException declared for callers and overriding classes; not thrown by this implementation
     */
    protected Query createQueryForSemanticKeywordMatch(String input) throws IOException {
        BooleanQuery booleanQuery = new BooleanQuery();
        List<String> tokens = getTokensFromAnalyzer(input);
        // for keyword search, when the token count exceeds MAX_TOKEN_COUNT_FOR_KEYWORD_MATCH, only search the beginning
        // tokens from input
        int tokenCount = Math.min(tokens.size(), MAX_TOKEN_COUNT_FOR_KEYWORD_MATCH);
        for (int i = 0; i < tokenCount; i++) {
            booleanQuery.add(getTermQuery(F_SYNTERM, tokens.get(i), false), BooleanClause.Occur.SHOULD);
        }
        return booleanQuery;
    }

    /**
     * Normalizes the input with {@link #getTokensFromAnalyzer(String)} and joins the tokens with a single space.
     *
     * @param input the text to normalize, must not be {@code null}
     * @return the joined tokens
     */
    public static String getJointTokens(String input) {
        return StringUtils.join(getTokensFromAnalyzer(input), ' ');
    }

    /**
     * Splits the input into tokens with a standard tokenizer, then lower-cases them and folds them to ASCII.
     * <p>
     * When the input yields exactly one token, that token is replaced by the whole input, lower-cased and stripped
     * of accents, so that punctuation is kept and an exact match is required.
     * <p>
     * An {@link IOException} raised while reading the tokens is logged and not propagated; the tokens collected up
     * to that point are returned.
     *
     * @param input the text to tokenize, must not be {@code null}
     * @return a list of lower-case tokens which strips accents & punctuation
     */
    public static List<String> getTokensFromAnalyzer(String input) {
        TokenStream result = new StandardFilter(new StandardTokenizer(new StringReader(input)));
        result = new LowerCaseFilter(result);
        result = new ASCIIFoldingFilter(result);
        CharTermAttribute charTermAttribute = result.addAttribute(CharTermAttribute.class);
        List<String> termList = new ArrayList<String>();
        try {
            // reset/end are invoked on the outermost stream so that every filter of the chain takes part
            result.reset();
            while (result.incrementToken()) {
                termList.add(charTermAttribute.toString());
            }
            result.end();
        } catch (IOException e) {
            LOGGER.debug("Failed to tokenize input", e);
        } finally {
            // always release the stream, even when reading the tokens failed
            try {
                result.close();
            } catch (IOException e) {
                LOGGER.debug("Failed to close token stream", e);
            }
        }
        if (termList.size() == 1) { // require exact match when the input has only one token
            termList.clear();
            // Locale.ROOT keeps the lower-casing independent from the JVM default locale, like LowerCaseFilter
            termList.add(StringUtils.stripAccents(input.toLowerCase(Locale.ROOT)));
        }
        return termList;
    }

}