package org.apache.lucene.search.concordance.windowvisitor; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsets; import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsetsVisitor; import org.apache.lucene.search.concordance.charoffsets.OffsetLengthStartComparator; import org.apache.lucene.search.concordance.charoffsets.OffsetUtil; import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer; import org.apache.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader; import org.apache.lucene.search.concordance.charoffsets.SpansCrawler; import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException; import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests; import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetsReader; import org.apache.lucene.search.concordance.classic.DocIdBuilder; import org.apache.lucene.search.concordance.util.ConcordanceSearcherUtil; import org.apache.lucene.search.spans.SimpleSpanQueryConverter; import org.apache.lucene.search.spans.SpanQuery; /** * Calculates term statistics for the tokens before and after a given query * term. * <p> * This can be very useful to help users identify synonyms or find patterns in * their data. */ public class ConcordanceArrayWindowSearcher { private boolean allowTargetOverlaps = false; /** * @param searcher indexSearcher to search * @param fieldName field to search * @param mainQuery mainQuery to use * @param filterQuery filterQuery to apply, can be null * @param analyzer analyzer re-analysis text * @param visitor handler for visiting windows * @param docIdBuilder builder for constructing unique document ids * @throws IllegalArgumentException if field not found in query, e.g. * @throws TargetTokenNotFoundException if target token is not found * @throws java.io.IOException if there's an underlying IOException with the reader */ public void search(IndexSearcher searcher, String fieldName, Query mainQuery, Query filterQuery, Analyzer analyzer, ArrayWindowVisitor visitor, DocIdBuilder docIdBuilder) throws IllegalArgumentException, TargetTokenNotFoundException, IOException { if (mainQuery instanceof SpanQuery) { // pass through searchSpan(searcher, (SpanQuery) mainQuery, filterQuery, analyzer, visitor, docIdBuilder); } else { // convert regular mainQuery to a SpanQuery. SimpleSpanQueryConverter converter = new SimpleSpanQueryConverter(); SpanQuery spanQuery = converter.convert(fieldName, mainQuery); Query updatedFilter = mainQuery; if (filterQuery != null) { updatedFilter = new BooleanQuery.Builder() .add(mainQuery, BooleanClause.Occur.MUST) .add(filterQuery, BooleanClause.Occur.FILTER).build(); } searchSpan(searcher, spanQuery, updatedFilter, analyzer, visitor, docIdBuilder); } } private void searchSpan(IndexSearcher searcher, SpanQuery query, Query filterQuery, Analyzer analyzer, ArrayWindowVisitor visitor, DocIdBuilder docIdBuilder) throws IllegalArgumentException, TargetTokenNotFoundException, IOException { String field = query.getField(); //if nothing is found for e.g. a prefix query, the returned query will //be an empty spanquery with a null field. We need to cache the field //in case this is destroyed in the rewrite. query = (SpanQuery) query.rewrite(searcher.getIndexReader()); CAWDocTokenOffsetsVisitor docTokenOffsetsVisitor = new CAWDocTokenOffsetsVisitor(field, analyzer, docIdBuilder, visitor); SpansCrawler.crawl(query, filterQuery, searcher, docTokenOffsetsVisitor); } /** * @param allowTargetOverlaps whether to allow targets to overlap or ignore overlapping * targets */ public void setAllowTargetOverlaps(boolean allowTargetOverlaps) { this.allowTargetOverlaps = allowTargetOverlaps; } private class CAWDocTokenOffsetsVisitor implements DocTokenOffsetsVisitor { final String fieldName; final TokenCharOffsetsReader tokenOffsetsReader; // reusable requests and results final TokenCharOffsetRequests offsetRequests = new TokenCharOffsetRequests(); final RandomAccessCharOffsetContainer offsetResults = new RandomAccessCharOffsetContainer(); final DocTokenOffsets docTokenOffsets = new DocTokenOffsets(); final OffsetLengthStartComparator offsetLengthStartComparator = new OffsetLengthStartComparator(); // reusable arrayWindow final ConcordanceArrayWindow arrayWindow; final ArrayWindowVisitor visitor; final Analyzer analyzer; final DocIdBuilder docIdBuilder; CAWDocTokenOffsetsVisitor(String fieldName, Analyzer analyzer, DocIdBuilder docIdBuilder, ArrayWindowVisitor visitor) { this.fieldName = fieldName; this.analyzer = analyzer; this.docIdBuilder = docIdBuilder; this.visitor = visitor; tokenOffsetsReader = new ReanalyzingTokenCharOffsetsReader(analyzer); arrayWindow = new ConcordanceArrayWindow( analyzer.getPositionIncrementGap(fieldName)); } @Override public DocTokenOffsets getDocTokenOffsets() { return docTokenOffsets; } @Override public Set<String> getFields() { Set<String> fields = new HashSet<>(); fields.add(fieldName); fields.addAll(docIdBuilder.getFields()); return fields; } @Override public boolean visit(DocTokenOffsets docTokenOffsets) throws IOException, TargetTokenNotFoundException { Document document = docTokenOffsets.getDocument(); String docId = docIdBuilder.build(document, docTokenOffsets.getUniqueDocId()); String[] fieldValues = document.getValues(fieldName); if (fieldValues == null) { throw new IOException("Mismatched content field"); } List<OffsetAttribute> offsets = docTokenOffsets.getOffsets(); if (!allowTargetOverlaps) { // remove overlapping hits offsets = OffsetUtil.removeOverlapsAndSort(offsets, offsetLengthStartComparator, null); } // can't imagine that this would ever happen if (offsets.size() == 0) { throw new IllegalArgumentException( "DEBUG: can't imagine that this would ever happen"); // just in case this does happen } // reset and then load offsetRequests offsetRequests.clear(); ConcordanceSearcherUtil.getCharOffsetRequests(offsets, visitor.getTokensBefore(), visitor.getTokensAfter(), offsetRequests); offsetResults.clear(); tokenOffsetsReader.getTokenCharOffsetResults(document, fieldName, offsetRequests, offsetResults); boolean keepGoing = visitWindowsInDoc(offsetResults, fieldValues, offsets, docId, arrayWindow, visitor, analyzer.getOffsetGap(fieldName)); return keepGoing; } private boolean visitWindowsInDoc(RandomAccessCharOffsetContainer offsetResults, String[] fieldValues, List<OffsetAttribute> offsets, String docId, ConcordanceArrayWindow window, ArrayWindowVisitor visitor, int offsetGap) throws IOException, TargetTokenNotFoundException { for (OffsetAttribute offset : offsets) { // hit max, stop now if (visitor.getHitMax()) { return false; } window.reset(); window = ArrayWindowBuilder.buildWindow(offset.startOffset(), offset.endOffset() - 1, visitor.getTokensBefore(), visitor.getTokensAfter(), offsetGap, offsetResults, fieldValues, window, visitor.includeTarget(), visitor.analyzeTarget()); visitor.visit(docId, window); } return true; } } }