package org.apache.lucene.search.concordance.classic;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsets;
import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsetsVisitor;
import org.apache.lucene.search.concordance.charoffsets.OffsetLengthStartComparator;
import org.apache.lucene.search.concordance.charoffsets.OffsetUtil;
import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer;
import org.apache.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader;
import org.apache.lucene.search.concordance.charoffsets.SpansCrawler;
import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException;
import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests;
import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetsReader;
import org.apache.lucene.search.concordance.util.ConcordanceSearcherUtil;
import org.apache.lucene.search.spans.SimpleSpanQueryConverter;
import org.apache.lucene.search.spans.SpanQuery;

/**
 * Searches an IndexReader and returns a list of ConcordanceWindows.
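 * <p>
 * A minimal usage sketch. The {@code directory} and the {@code "content"}
 * field are hypothetical, and {@code ConcordanceWindowCollector} stands in
 * for a concrete {@link AbstractConcordanceWindowCollector}; verify the
 * constructor against the collector implementation you actually use.
 * </p>
 * <pre>{@code
 * IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory));
 * Analyzer analyzer = new StandardAnalyzer();
 *
 * ConcordanceSearcher concordanceSearcher = new ConcordanceSearcher();
 * AbstractConcordanceWindowCollector collector = new ConcordanceWindowCollector(1000);
 * concordanceSearcher.search(searcher, "content",
 *     new TermQuery(new Term("content", "lucene")), null, analyzer, collector);
 * // the collected ConcordanceWindows are now available from the collector
 * }</pre>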
 */
public class ConcordanceSearcher {

  /**
   * Allow overlapping targets in hits; default = false
   */
  private boolean allowTargetOverlaps = false;

  private final WindowBuilder windowBuilder;

  private SimpleSpanQueryConverter spanQueryConverter;

  /**
   * Constructor with default WindowBuilder and SimpleSpanQueryConverter
   */
  public ConcordanceSearcher() {
    this(new WindowBuilder(), new SimpleSpanQueryConverter());
  }

  /**
   * Constructor for a custom WindowBuilder and the default SimpleSpanQueryConverter
   *
   * @param windowBuilder window builder
   */
  public ConcordanceSearcher(WindowBuilder windowBuilder) {
    this(windowBuilder, new SimpleSpanQueryConverter());
  }

  /**
   * Constructor for windowBuilder and converter
   *
   * @param windowBuilder windowBuilder to use to build windows
   * @param converter     converter to use to convert a Query to a SpanQuery
   */
  private ConcordanceSearcher(WindowBuilder windowBuilder,
                              SimpleSpanQueryConverter converter) {
    this.windowBuilder = windowBuilder;
    this.spanQueryConverter = converter;
  }

  /**
   * Searches the index and builds a concordance window around each hit.
   *
   * @param searcher    searcher to search
   * @param fieldName   field to build the windows on
   * @param mainQuery   if a SpanQuery, this is passed through as is;
   *                    if a regular Query, the Query is first converted
   *                    to a SpanQuery and the filterQuery is modified
   *                    to include the original Query
   * @param filterQuery filter to apply in addition to the mainQuery; may be null
   * @param analyzer    analyzer to use for (re)calculating character offsets
   *                    and for normalizing the sort keys
   * @param collector   collector to use for search
   * @throws TargetTokenNotFoundException if a target token is not found
   * @throws IllegalArgumentException     if the field can't be found in the main query
   * @throws java.io.IOException          if there is an underlying IOException in the reader
   */
  public void search(IndexSearcher searcher, String fieldName,
                     Query mainQuery, Query filterQuery, Analyzer analyzer,
                     AbstractConcordanceWindowCollector collector)
      throws TargetTokenNotFoundException, IllegalArgumentException, IOException {
    if (mainQuery == null) {
      return;
    }
    if (mainQuery instanceof SpanQuery) {
      // pass through
      searchSpan(searcher, (SpanQuery) mainQuery, filterQuery, analyzer, collector);
    } else {
      // convert the regular mainQuery to a SpanQuery
      SpanQuery spanQuery = spanQueryConverter.convert(fieldName, mainQuery);
      // always keep the original mainQuery in the filter so that windows are
      // built only on documents that match it; AND in the caller's
      // filterQuery if one was given
      Query updatedFilter = mainQuery;
      if (filterQuery != null) {
        updatedFilter = new BooleanQuery.Builder()
            .add(mainQuery, BooleanClause.Occur.MUST)
            .add(filterQuery, BooleanClause.Occur.FILTER)
            .build();
      }
      searchSpan(searcher, spanQuery, updatedFilter, analyzer, collector);
    }
  }

  /**
   * Like
   * {@link #search(IndexSearcher, String, Query, Query, Analyzer, AbstractConcordanceWindowCollector)}
   * but this takes a SpanQuery.
   *
   * @param searcher  searcher
   * @param spanQuery query to use to identify the targets
   * @param filter    filter for document retrieval
   * @param analyzer  analyzer to re-analyze terms for window calculations and sort key building
   * @param collector collector to process (and store) the results
   * @throws TargetTokenNotFoundException if a target token is not found
   * @throws IllegalArgumentException     if the field can't be found in the main query
   * @throws java.io.IOException          if there is an underlying IOException in the reader
   */
  private void searchSpan(IndexSearcher searcher, SpanQuery spanQuery,
                          Query filter, Analyzer analyzer,
                          AbstractConcordanceWindowCollector collector)
      throws TargetTokenNotFoundException, IllegalArgumentException, IOException {
    spanQuery = (SpanQuery) spanQuery.rewrite(searcher.getIndexReader());
    Set<String> fields = new HashSet<>(windowBuilder.getFieldSelector());
    fields.add(spanQuery.getField());
    DocTokenOffsetsVisitor visitor =
        new ConcDTOffsetVisitor(spanQuery.getField(), analyzer, fields, collector);
    SpansCrawler.crawl(spanQuery, filter, searcher, visitor);
    collector.setTotalDocs(searcher.getIndexReader().numDocs());
  }

  /**
   * Spans can overlap: a search for ["ab cd" "ab"] would have
   * two spans on the string "ab cd" if this is set to true.
   * If this is set to false, only the longest span that appears
   * earliest in the string is returned where spans overlap.
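   * <p>
   * An illustrative sketch of the two settings, using the example above
   * (text {@code "ab cd"}, a query equivalent to ["ab cd" "ab"]):
   * </p>
   * <pre>{@code
   * searcher.setAllowTargetOverlaps(true);
   * // two target spans are found: "ab cd" and "ab"
   *
   * searcher.setAllowTargetOverlaps(false); // the default
   * // one target span is found: "ab cd" (longest, earliest)
   * }</pre>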
   *
   * @param allowTargetOverlaps are targets allowed to overlap
   */
  public void setAllowTargetOverlaps(boolean allowTargetOverlaps) {
    this.allowTargetOverlaps = allowTargetOverlaps;
  }

  private void throwMissingField(Document document) throws IllegalArgumentException {
    StringBuilder sb = new StringBuilder();
    sb.append("Did you forget to load or specify the correct content field?!");
    sb.append("\n");
    sb.append("I only see these fields:\n");
    for (IndexableField f : document.getFields()) {
      sb.append(f.name()).append("\n");
    }
    throw new IllegalArgumentException(sb.toString());
  }

  /**
   * Set the converter to use to convert a Query to a SpanQuery.
   * The need for this will go away when LUCENE-2878 is completed.
   *
   * @param converter converter to use to convert queries into SpanQueries
   */
  public void setSpanQueryConverter(SimpleSpanQueryConverter converter) {
    this.spanQueryConverter = converter;
  }

  class ConcDTOffsetVisitor implements DocTokenOffsetsVisitor {
    final Set<String> fields;
    final DocTokenOffsets docTokenOffsets = new DocTokenOffsets();
    final Analyzer analyzer;
    final String fieldName;
    final AbstractConcordanceWindowCollector collector;
    final TokenCharOffsetRequests requests = new TokenCharOffsetRequests();
    final TokenCharOffsetsReader tokenOffsetsRecordReader;
    final RandomAccessCharOffsetContainer offsetResults = new RandomAccessCharOffsetContainer();
    final OffsetLengthStartComparator offsetLengthStartComparator = new OffsetLengthStartComparator();

    ConcDTOffsetVisitor(String fieldName, Analyzer analyzer, Set<String> fields,
                        AbstractConcordanceWindowCollector collector) {
      this.fieldName = fieldName;
      this.analyzer = analyzer;
      this.fields = fields;
      this.collector = collector;
      tokenOffsetsRecordReader = new ReanalyzingTokenCharOffsetsReader(analyzer);
    }

    @Override
    public DocTokenOffsets getDocTokenOffsets() {
      return docTokenOffsets;
    }

    @Override
    public Set<String> getFields() {
      return fields;
    }

    @Override
    public boolean visit(DocTokenOffsets docTokenOffsets) throws IOException {
      Document document = docTokenOffsets.getDocument();
      String[] fieldValues = document.getValues(fieldName);
      if (fieldValues == null || fieldValues.length == 0) {
        throwMissingField(document);
      }
      Map<String, String> metadata = windowBuilder.extractMetadata(document);
      String docId = windowBuilder.getUniqueDocumentId(document,
          docTokenOffsets.getUniqueDocId());

      List<OffsetAttribute> tokenOffsets = docTokenOffsets.getOffsets();
      if (!allowTargetOverlaps) {
        // remove overlapping hits
        tokenOffsets = OffsetUtil.removeOverlapsAndSort(tokenOffsets,
            offsetLengthStartComparator, null);
      }
      // clear, then build the character-offset requests for this document
      requests.clear();
      ConcordanceSearcherUtil.getCharOffsetRequests(tokenOffsets,
          windowBuilder.getTokensBefore(), windowBuilder.getTokensAfter(), requests);

      offsetResults.clear();
      tokenOffsetsRecordReader.getTokenCharOffsetResults(document,
          fieldName, requests, offsetResults);

      for (OffsetAttribute offset : tokenOffsets) {
        try {
          ConcordanceWindow w = windowBuilder.buildConcordanceWindow(
              docId, offset.startOffset(), offset.endOffset() - 1,
              fieldValues, offsetResults, metadata);
          collector.collect(w);
        } catch (TargetTokenNotFoundException e) {
          throw new IllegalArgumentException(e);
        }
        if (collector.getHitMax()) {
          return false;
        }
      }
      return true;
    }
  }
}