DictionarySearcher.java example

Explorer
data-quality-master
// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.semantic.index;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.*;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.talend.dataquality.semantic.model.DQCategory;
import org.talend.dataquality.semantic.model.ValidationMode;

/**
 * Searcher over a Lucene dictionary index, backed by a {@link SearcherManager}.
 * <p>
 * Every {@link IndexSearcher} obtained from the manager is released in a {@code finally} block so that a failing
 * search cannot leak a reference to the underlying index reader.
 * <p>
 * If the index cannot be opened, the constructors only log the failure: the instance is then unusable and every
 * search method fails with a {@link NullPointerException}.
 */
public class DictionarySearcher extends AbstractDictionarySearcher {

    public static final String UNABLE_TO_OPEN_INDEX = "Unable to open synonym index.";

    private static final Logger LOGGER = Logger.getLogger(DictionarySearcher.class);

    /** Source of index searchers; stays {@code null} when the index could not be opened. */
    private SearcherManager mgr;

    /**
     * Category filters keyed by the id of the category given to
     * {@link #validDocumentWithCategories(String, DQCategory, Set)}. The key does not include the children, so the
     * children supplied on the first call for a category decide the filter reused by later calls.
     */
    private final Map<String, CachingWrapperFilter> categoryToCache = new HashMap<>();

    /**
     * Creates this searcher on the index stored in a file system folder.
     *
     * @param indexPath the path to the index.
     */
    public DictionarySearcher(String indexPath) {
        try {
            FSDirectory indexDir = FSDirectory.open(new File(indexPath));
            mgr = new SearcherManager(indexDir, null);
        } catch (IOException e) {
            LOGGER.error(UNABLE_TO_OPEN_INDEX, e);
        }
    }

    /**
     * Creates this searcher on the index located by a URI.
     *
     * @param indexPathURI the URI of the index, opened through {@link ClassPathDirectory}.
     */
    public DictionarySearcher(URI indexPathURI) {
        try {
            Directory indexDir = ClassPathDirectory.open(indexPathURI);
            mgr = new SearcherManager(indexDir, null);
        } catch (IOException e) {
            LOGGER.error(UNABLE_TO_OPEN_INDEX, e);
        }
    }

    /**
     * Creates this searcher on an already opened Lucene directory.
     *
     * @param indexDir the directory holding the index.
     */
    public DictionarySearcher(Directory indexDir) {
        try {
            mgr = new SearcherManager(indexDir, null);
        } catch (IOException e) {
            LOGGER.error(UNABLE_TO_OPEN_INDEX, e);
        }
    }

    /**
     * Searches for documents by one of the synonyms (which may be the word itself).
     *
     * @param stringToSearch the value to look up
     * @return the best hits, at most {@code topDocLimit} of them
     * @throws java.io.IOException if the index cannot be read
     */
    @Override
    public TopDocs searchDocumentBySynonym(String stringToSearch) throws IOException {
        final Query query = createQuery(stringToSearch);
        final IndexSearcher searcher = mgr.acquire();
        try {
            return searcher.search(query, topDocLimit);
        } finally {
            mgr.release(searcher);
        }
    }

    /**
     * Gets a document from a search result by its document number.
     *
     * @param docNum the doc number
     * @return the document (can be null if any problem)
     */
    @Override
    public Document getDocument(int docNum) {
        Document doc = null;
        try {
            final IndexSearcher searcher = mgr.acquire();
            try {
                doc = searcher.doc(docNum);
            } finally {
                mgr.release(searcher);
            }
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
        return doc;
    }

    /**
     * Checks whether a value is valid for a category, or for one of its children when children are given.
     * <p>
     * The search is restricted to the documents whose category id is either the id of {@code semanticType} (no
     * children) or the id of one of the {@code children}. A hit is then accepted according to the validation mode of
     * the category it belongs to; a missing validation mode is treated as {@link ValidationMode#EXACT}.
     *
     * @param stringToSearch the value to validate
     * @param semanticType the category to validate against
     * @param children the child categories of {@code semanticType}; may be null or empty
     * @return true if at least one matching document validates the value
     * @throws IOException if the index cannot be read
     */
    public boolean validDocumentWithCategories(String stringToSearch, DQCategory semanticType, Set<DQCategory> children)
            throws IOException {
        final Query query = createQuery(stringToSearch);
        final boolean hasChildren = !CollectionUtils.isEmpty(children);
        // Built before acquiring the searcher so that a failure here cannot leak a searcher reference.
        final Filter categoryFilter = getCategoryFilter(semanticType, children, hasChildren);

        final IndexSearcher searcher = mgr.acquire();
        try {
            final TopDocs docs = searcher.search(query, categoryFilter, topDocLimit);

            ValidationMode validationMode = ValidationMode.EXACT;
            if (!hasChildren && semanticType.getValidationMode() != null) {
                validationMode = semanticType.getValidationMode();
                if (ValidationMode.SIMPLIFIED.equals(validationMode)) {
                    // In simplified mode any hit is enough, no need to load the documents.
                    return docs.totalHits != 0;
                }
            }

            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                final Document document = searcher.doc(scoreDoc.doc);
                if (hasChildren) {
                    validationMode = getChildrenValidationMode(children, document);
                }
                if (validDocumentByValidationMode(document, stringToSearch, validationMode)) {
                    return true;
                }
            }
            return false;
        } finally {
            mgr.release(searcher);
        }
    }

    /**
     * Builds the query matching the current search mode.
     *
     * @param stringToSearch the value to look up
     * @return the keyword query in MATCH_SEMANTIC_KEYWORD mode, the dictionary query in any other mode
     * @throws IOException if the query cannot be built
     */
    private Query createQuery(String stringToSearch) throws IOException {
        switch (searchMode) {
        case MATCH_SEMANTIC_KEYWORD:
            return createQueryForSemanticKeywordMatch(stringToSearch);
        case MATCH_SEMANTIC_DICTIONARY:
        default: // do the same as MATCH_SEMANTIC_DICTIONARY mode
            return createQueryForSemanticDictionaryMatch(stringToSearch);
        }
    }

    /**
     * Returns the filter defining the subset of documents in which a category is searched, creating and caching it
     * on first use.
     *
     * @param semanticType the category, whose id is the cache key
     * @param children the child categories
     * @param hasChildren whether {@code children} is non-empty
     * @return the cached filter on the children ids, or on the category id when there are no children
     */
    private CachingWrapperFilter getCategoryFilter(DQCategory semanticType, Set<DQCategory> children,
            boolean hasChildren) {
        final String categoryId = semanticType.getId();
        CachingWrapperFilter filter = categoryToCache.get(categoryId);
        if (filter == null) {
            final String[] categoryIds;
            if (hasChildren) {
                final Set<String> childrenIds = new HashSet<>();
                for (DQCategory child : children) {
                    childrenIds.add(child.getId());
                }
                categoryIds = childrenIds.toArray(new String[childrenIds.size()]);
            } else {
                categoryIds = new String[] { categoryId };
            }
            filter = new CachingWrapperFilter(new FieldCacheTermsFilter(F_CATID, categoryIds));
            categoryToCache.put(categoryId, filter);
        }
        return filter;
    }

    /**
     * Finds the validation mode associated with a found document, by looking for the child category whose id equals
     * the category id stored in the document.
     *
     * @param children the categories
     * @param document the found document
     * @return the validation mode of the matching child, {@link ValidationMode#EXACT} if the child defines none or if
     * no child matches
     */
    private ValidationMode getChildrenValidationMode(Set<DQCategory> children, Document document) {
        // Read once; null when the document has no category id, in which case no child matches.
        final String documentCategoryId = document.get(F_CATID);
        for (DQCategory child : children) {
            if (child.getId().equals(documentCategoryId)) {
                return child.getValidationMode() != null ? child.getValidationMode() : ValidationMode.EXACT;
            }
        }
        // We should never enter here if everything went well, hence we log at an error level
        LOGGER.error("The document does not belong to any children category");
        return ValidationMode.EXACT;
    }

    /**
     * Validates a value against the raw values of a document, according to a validation mode.
     *
     * @param document the document found in the lucene index
     * @param stringToSearch the value to validate
     * @param validationMode the way the value and the raw values are compared
     * @return true in simplified mode; otherwise true if the non-empty transformed value equals one of the
     * transformed raw values of the document
     */
    private boolean validDocumentByValidationMode(Document document, String stringToSearch,
            ValidationMode validationMode) {
        if (ValidationMode.SIMPLIFIED.equals(validationMode)) {
            return true;
        }
        final String transformedString = transformStringByValidationMode(stringToSearch, validationMode);
        if (StringUtils.isEmpty(transformedString)) {
            return false;
        }
        for (String raw : document.getValues(F_RAW)) {
            if (transformedString.equals(transformStringByValidationMode(raw, validationMode))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Transforms a string according to a validation mode.
     *
     * @param stringToTransform the string to transform, may be null
     * @param validationMode the validation mode
     * @return the lower-cased string without accents in EXACT_IGNORE_CASE_AND_ACCENT mode, the unchanged string
     * (possibly null) otherwise
     */
    private String transformStringByValidationMode(String stringToTransform, ValidationMode validationMode) {
        if (stringToTransform != null && ValidationMode.EXACT_IGNORE_CASE_AND_ACCENT.equals(validationMode)) {
            // Locale.ROOT keeps the comparison independent from the default locale of the JVM (e.g. Turkish "I").
            return StringUtils.stripAccents(stringToTransform.toLowerCase(Locale.ROOT));
        }
        return stringToTransform;
    }

    /**
     * Creates a filter accepting the documents whose F_WORD field holds one of the given values.
     *
     * @param semanticTypes the accepted values
     * @return the filter
     */
    protected Filter createFilterForSemanticTypes(Set<String> semanticTypes) {
        List<Term> terms = new ArrayList<>(semanticTypes.size());
        for (String semanticType : semanticTypes) {
            terms.add(new Term(F_WORD, semanticType));
        }
        return new TermsFilter(terms);
    }

    /**
     * Method "getWordByDocNumber".
     *
     * @param docNo the document number
     * @return the first F_WORD value of the document, or null if the document is not found or has no such value
     */
    public String getWordByDocNumber(int docNo) {
        Document document = getDocument(docNo);
        // Document.get yields null when the field is absent, where indexing getValues(...)[0] would throw.
        return document != null ? document.get(F_WORD) : null;
    }

    /**
     * Method "getSynonymsByDocNumber".
     *
     * @param docNo the doc number
     * @return the synonyms or null if no document is found
     */
    public String[] getSynonymsByDocNumber(int docNo) {
        Document document = getDocument(docNo);
        return document != null ? document.getValues(F_RAW) : null;
    }

    /**
     * Method "getNumDocs".
     *
     * @return the number of documents in the index, or -1 if the index cannot be read
     */
    public int getNumDocs() {
        try {
            final IndexSearcher searcher = mgr.acquire();
            try {
                return searcher.getIndexReader().numDocs();
            } finally {
                mgr.release(searcher);
            }
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
        return -1;
    }

    /**
     * Closes the searcher manager, which drops its reference to the index reader. The searcher must not be used
     * afterwards. Does nothing when the index was never opened.
     */
    public void close() {
        if (mgr == null) {
            return;
        }
        try {
            // Closing through the manager releases its own reference. Acquiring a searcher here and closing its
            // reader directly would add a reference that is never given back, leaving the reader open.
            mgr.close();
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    /**
     * Asks the searcher manager to reopen the index if it has changed; a failure is only logged.
     */
    public void maybeRefreshIndex() {
        try {
            mgr.maybeRefresh();
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
    }
}