package com.tyndalehouse.step.core.data.entities.impl; import static org.apache.lucene.util.Version.LUCENE_30; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.net.URI; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import com.tyndalehouse.step.core.data.common.TermsAndMaxCount; import com.tyndalehouse.step.core.utils.LuceneUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.QueryParser.Operator; import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.MMapDirectory; import org.slf4j.Logger; import com.tyndalehouse.step.core.data.AllResultsCollector; import com.tyndalehouse.step.core.data.AnalyzedPrefixSearchQueryParser; import com.tyndalehouse.step.core.data.EntityConfiguration; import com.tyndalehouse.step.core.data.EntityDoc; import com.tyndalehouse.step.core.data.EntityIndexReader; import com.tyndalehouse.step.core.exceptions.StepInternalException; import com.tyndalehouse.step.core.utils.IOUtils; /** * Reads an entity * * @author chrisburrell */ public class EntityIndexReaderImpl implements EntityIndexReader { private static final Logger LOGGER = org.slf4j.LoggerFactory.getLogger(EntityIndexReaderImpl.class); private IndexSearcher searcher; private Directory directory; private final EntityConfiguration config; private boolean memoryMapped; /** * Entity reader * * @param config the config about the reader * @param memoryMapped true to indicate index should be stored in memory */ public EntityIndexReaderImpl(final EntityConfiguration config, final boolean memoryMapped) { this.config = config; this.memoryMapped = memoryMapped; initialise(); } /** * Entity reader - does not initialise it entirely - careful when using this. * * @param config the config about the reader */ EntityIndexReaderImpl(final EntityConfiguration config) { this.config = config; } @Override public Analyzer getAnalyzer() { return this.config.getAnalyzerInstance(); } /** * Initialises the index reader */ private void initialise() { try { openDirectory(this.config, this.memoryMapped); if (this.directory != null) { this.searcher = new IndexSearcher(this.directory, true); } } catch (final IOException e) { LOGGER.warn("Index not readable - it may not yet have been created."); LOGGER.trace("Trace for exception:", e); } } @Override public void refresh() { close(); initialise(); } @Override public void close() { IOUtils.closeQuietly(this.searcher); IOUtils.closeQuietly(this.directory); } /** * Gets the best implementation of the directory * * @param configuration config * @param memoryMapDirectory memory mapped directories */ private void openDirectory(final EntityConfiguration configuration, final boolean memoryMapDirectory) { try { final URI entityIndexPath = configuration.getLocation(); final File path = new File(entityIndexPath); if (!path.exists()) { return; } if (memoryMapDirectory) { this.directory = MMapDirectory.open(path); } this.directory = FSDirectory.open(path); } catch (final IOException e) { throw new StepInternalException("Unable to read directory", e); } } @Override public EntityDoc[] searchExactTermBySingleField(final String fieldName, final int max, final String... values) { final Query query = getQuery(fieldName, values); return search(query, max, null, null); } @Override public EntityDoc[] searchUniqueBySingleField(final String fieldName, final String... values) { final Query query = getQuery(fieldName, values); return search(query, values.length, null, null); } @Override public EntityDoc[] search(final String[] fieldNames, final String value) { return search(fieldNames, value, null, null, false, null, null); } @Override public EntityDoc[] search(final String[] fieldNames, final String value, final Sort sort) { return search(fieldNames, value, null, sort, false, null, null); } @Override public EntityDoc[] search(final String[] fieldNames, final String value, final Filter filter, final Sort sort, final boolean analyzePrefix) { return search(fieldNames, value, filter, sort, analyzePrefix, null, null); } @Override public EntityDoc[] search(final String[] fieldNames, final String value, final Filter strongFilter, final Sort transliterationSort, final boolean analyzePrefix, final Integer maxResults) { return search(fieldNames, value, strongFilter, transliterationSort, analyzePrefix, null, maxResults); } @Override public EntityDoc[] search(final String[] fieldNames, final String value, final Filter filter, final Sort sort, final boolean analyzePrefix, final String queryRemainder) { return search(fieldNames, value, filter, sort, analyzePrefix, queryRemainder, null); } @Override public EntityDoc[] search(final String[] fieldNames, final String value, final Filter filter, final Sort sort, final boolean analyzePrefix, final String queryRemainder, final Integer maxResults) { return search(fieldNames, value, filter, sort, analyzePrefix, queryRemainder, maxResults, true); } @Override public EntityDoc[] search(final String[] fields, final String query, final boolean useOrOperator) { return search(fields, query, null, null, false, null, null, useOrOperator); } @Override public EntityDoc[] search(String[] fields, String query, boolean useOrOperator, Sort sort) { return search(fields, query, null, sort, false, null, null, useOrOperator); } // CHECKSTYLE:OFF @Override public EntityDoc[] search(final String[] fieldNames, final String value, final Filter filter, final Sort sort, final boolean analyzePrefix, final String queryRemainder, final Integer maxResults, final boolean useOrOperatorBetweenValues) { // CHECKSTYLE:ON final AllResultsCollector collector = new AllResultsCollector(); Query parsed = null; QueryParser parser; if (analyzePrefix) { parser = new AnalyzedPrefixSearchQueryParser(LUCENE_30, fieldNames, this.config.getAnalyzerInstance()); } else { parser = new MultiFieldQueryParser(LUCENE_30, fieldNames, this.config.getAnalyzerInstance()); } parser.setDefaultOperator(useOrOperatorBetweenValues ? Operator.OR : Operator.AND); try { if (queryRemainder != null) { final StringBuilder sb = new StringBuilder(value.length() + queryRemainder.length() + 1); sb.append(value); sb.append(' '); sb.append(queryRemainder); parsed = parser.parse(sb.toString()); } else { parsed = parser.parse(value); } LOGGER.debug("Search query is [{}]", parsed); if (sort != null) { final TopFieldDocs search = this.searcher.search(parsed, filter, maxResults == null ? Integer.MAX_VALUE : maxResults, sort); return extractDocIds(search); } else { this.searcher.search(parsed, filter, collector); return extractDocIds(collector); } } catch (final ParseException e) { throw new StepInternalException("Unable to parse query", e); } catch (final IOException e) { throw new StepInternalException( "Unable to search given query: " + parsed != null ? parsed.toString() : "<unknown>", e); } } @Override public Set<String> findSetOfTerms(final boolean exact, String searchTerm, int maxReturned, final String... fieldNames) { return findSetOfTermsWithCounts(exact, false, searchTerm, maxReturned, fieldNames).getTerms(); } @Override public TermsAndMaxCount findSetOfTermsWithCounts(final boolean exact, final boolean trackMax, String searchTerm, int maxReturned, final String... fieldNames) { TermsAndMaxCount hits = new TermsAndMaxCount(); if (fieldNames.length == 0) { hits.setTerms(new HashSet<String>(0)); return hits; } if (fieldNames.length == 1) { return LuceneUtils.getAllTermsPrefixedWith(exact, trackMax, this.searcher, fieldNames[0], searchTerm, maxReturned); } hits.setTerms(new HashSet<String>(32)); for (int ii = 0; ii < fieldNames.length; ii++) { final TermsAndMaxCount termsByField = LuceneUtils.getAllTermsPrefixedWith(exact, trackMax, this.searcher, fieldNames[ii], searchTerm, maxReturned); hits.getTerms().addAll(termsByField.getTerms()); hits.setTotalCount(hits.getTotalCount() + termsByField.getTotalCount()); } //total count, is count - the existing ters hits.setTotalCount(hits.getTotalCount() - hits.getTerms().size()); return hits; } /** * Extracts all the results * * @param results the results that have been collected * @return the results */ private EntityDoc[] extractDocIds(final TopDocs results) { try { final ScoreDoc[] scoreDocs = results.scoreDocs; final EntityDoc[] docs = new EntityDoc[scoreDocs.length]; for (int ii = 0; ii < scoreDocs.length; ii++) { docs[ii] = new EntityDoc(this.searcher.doc(scoreDocs[ii].doc)); } return docs; } catch (final IOException e) { throw new StepInternalException("Unable to extract results", e); } } @Override public EntityDoc[] search(final String defaultField, final String querySyntax) { final QueryParser parser = getQueryParser(defaultField); try { return this.search(parser.parse(querySyntax)); } catch (final ParseException e) { throw new StepInternalException("Unable to parse query " + querySyntax, e); } } @Override public QueryParser getQueryParser(final String defaultField) { return new QueryParser(LUCENE_30, defaultField, getAnalyzer()); } @Override public EntityDoc[] search(final Query query) { final AllResultsCollector collector = new AllResultsCollector(); try { LOGGER.debug("Search query is [{}], with filter [{}]", query); this.searcher.search(query, collector); return extractDocIds(collector); } catch (final IOException e) { throw new StepInternalException("Unable to search", e); } } /** * Extracts the query results into an entity doc * * @param collector the collector with the results * @return all the entity documents */ private EntityDoc[] extractDocIds(final AllResultsCollector collector) { try { final List<Integer> docIds = collector.getDocIds(); final EntityDoc[] docs = new EntityDoc[docIds.size()]; for (int ii = 0; ii < docIds.size(); ii++) { docs[ii] = new EntityDoc(this.searcher.doc(docIds.get(ii))); } return docs; } catch (final IOException e) { throw new StepInternalException("Unable to extract results from query", e); } } @Override public EntityDoc[] search(final Query query, final int max, final Sort sortField, final Filter filter) { LOGGER.debug("Search query is [{}]", query); try { final TopDocs search; if (sortField != null) { search = this.searcher.search(query, filter, max, sortField); } else { search = this.searcher.search(query, filter, max); } final EntityDoc[] results = new EntityDoc[search.scoreDocs.length]; for (int ii = 0; ii < search.scoreDocs.length; ii++) { results[ii] = new EntityDoc(this.searcher.doc(search.scoreDocs[ii].doc)); } return results; } catch (final IOException e) { throw new StepInternalException("Failed to search", e); } } /** * Returns a query that matches the provided terms * * @param fieldName the field name * @param values the values passed in * @return query */ private Query getQuery(final String fieldName, final String... values) { if (values.length == 1) { return new TermQuery(new Term(fieldName, values[0])); } final Term t = new Term(fieldName); final Term[] ts = new Term[values.length]; for (int ii = 0; ii < ts.length; ii++) { ts[ii] = t.createTerm(values[ii]); } final BooleanQuery booleanQuery = new BooleanQuery(); for (final Term term : ts) { booleanQuery.add(new TermQuery(term), Occur.SHOULD); } return booleanQuery; } @Override public EntityDoc[] searchSingleColumn(final String fieldName, final String querySyntax, final Filter filter) { return searchSingleColumn(fieldName, querySyntax, Operator.OR, false, null, filter); } @Override public EntityDoc[] searchSingleColumn(final String fieldName, final String querySyntax, final Sort sort) { return searchSingleColumn(fieldName, querySyntax, Operator.OR, false, sort); } @Override public EntityDoc[] searchSingleColumn(final String fieldName, final String querySyntax, final Operator op, final boolean allowLeadingWildcard) { return searchSingleColumn(fieldName, querySyntax, op, allowLeadingWildcard, null); } @Override public EntityDoc[] searchSingleColumn(final String fieldName, final String query, final boolean useOrOperator, Sort sort) { return searchSingleColumn(fieldName, query, useOrOperator ? Operator.OR : Operator.AND, false, sort); } @Override public EntityDoc[] searchSingleColumn(final String fieldName, final String querySyntax) { return searchSingleColumn(fieldName, querySyntax, Operator.OR, false); } @Override public EntityDoc[] searchSingleColumn(final String fieldName, final String querySyntax, final Operator op, final boolean allowLeadingWildcard, final Sort sort) { return searchSingleColumn(fieldName, querySyntax, op, allowLeadingWildcard, sort, null); } @Override public EntityDoc[] searchSingleColumn(final String fieldName, final String querySyntax, final Operator op, final boolean allowLeadingWildcard, final Sort sort, final Filter filter) { final QueryParser parser = new QueryParser(LUCENE_30, fieldName, this.getAnalyzer()); parser.setDefaultOperator(op); parser.setAllowLeadingWildcard(allowLeadingWildcard); try { final Query query = parser.parse(querySyntax); return search(query, Integer.MAX_VALUE, sort, filter); } catch (final ParseException e) { throw new StepInternalException("Unable to parse query", e); } } @Override public List<String> getAnalyzedTokens(final String fieldName, final String input, boolean escapeToken) { final TokenStream tokens = this.getAnalyzer().tokenStream(fieldName, new StringReader(input)); //construct query to search for both stepGloss and translations - the last word gets a trailing wildcard //query will be in the form of +(gloss:a trans:a) +(gloss:b* trans:b*) +strong:H* List<String> tokenItems = new ArrayList<String>(2); try { tokens.reset(); TermAttribute termAttribute = tokens.getAttribute(TermAttribute.class); while (tokens.incrementToken()) { String term = termAttribute.term(); if (escapeToken) { term = QueryParser.escape(term); } tokenItems.add(term); } } catch (IOException e) { throw new StepInternalException("Unable to parse query", e); } finally { try { tokens.end(); tokens.close(); } catch (IOException e) { LOGGER.trace("Unable to properly close stream."); } } return tokenItems; } @Override public EntityDoc[] search(BooleanQuery query, Filter filter, TopFieldCollector collector) { try { this.searcher.search(query, filter, collector); return extractDocIds(collector.topDocs()); } catch (IOException e) { throw new StepInternalException(e.getMessage(), e); } } @Override public QueryParser getQueryParser(final boolean analyzePrefix, final boolean useOrOperatorBetweenValues, final String... defaultFields) { QueryParser parser; if (analyzePrefix) { parser = new AnalyzedPrefixSearchQueryParser(LUCENE_30, defaultFields, this.config.getAnalyzerInstance()); } else if (defaultFields.length == 1) { parser = new QueryParser(LUCENE_30, defaultFields[0], this.config.getAnalyzerInstance()); } else { parser = new MultiFieldQueryParser(LUCENE_30, defaultFields, this.config.getAnalyzerInstance()); } parser.setDefaultOperator(useOrOperatorBetweenValues ? Operator.OR : Operator.AND); return parser; } /** * @param searcher the searcher to set */ void setSearcher(final IndexSearcher searcher) { this.searcher = searcher; } }