package org.apache.lucene.search.concordance.classic;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer;
import org.apache.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil;
import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException;
import org.apache.lucene.search.concordance.classic.impl.DefaultSortKeyBuilder;
import org.apache.lucene.search.concordance.classic.impl.FieldBasedDocIdBuilder;
import org.apache.lucene.search.concordance.classic.impl.IndexIdDocIdBuilder;
import org.apache.lucene.search.concordance.classic.impl.SimpleDocMetadataExtractor;
/**
* Builds a ConcordanceWindow.
* <p>
* This class includes basic functionality for building a window from token offsets.
* <p>
* It also calls three other components:
* <ol>
* <li>DocIdBuilder - extracts or builds a unique key for each document</li>
* <li>DocMetadataExtractor - extracts metadata from a document to be stored with each window</li>
* <li>SortKeyBuilder - builds a window's sort key</li>
* </ol>
*/
public class WindowBuilder {

  private static final String EMPTY_STRING = "";
  private static final String INTER_MULTIVALUE_FIELD_PADDING = " | ";

  // number of context tokens to include before/after the target span
  private final int tokensBefore;
  private final int tokensAfter;
  private final SortKeyBuilder sortKeyBuilder;
  private final DocMetadataExtractor metadataExtractor;
  private final DocIdBuilder docIdBuilder;
  // analyzer's position/offset gap between values of a multivalued field
  private final int offsetGap;

  /**
   * Creates a builder with defaults: 10 tokens of context before and after the
   * target, an offset gap of 0, a {@link DefaultSortKeyBuilder} sorting on the
   * pre-target context, a {@link SimpleDocMetadataExtractor} and an
   * {@link IndexIdDocIdBuilder}.
   */
  public WindowBuilder() {
    this(
        10, //tokens before
        10, //tokens after
        0,
        new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE),
        new SimpleDocMetadataExtractor(),
        new IndexIdDocIdBuilder()
    );
  }

  /**
   * Creates a builder with the given context sizes and offset gap, and default
   * sort key builder, metadata extractor and doc id builder.
   *
   * @param tokensBefore number of context tokens to capture before the target
   * @param tokensAfter number of context tokens to capture after the target
   * @param offsetGap offset gap between values of a multivalued field
   */
  public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap) {
    this(
        tokensBefore,
        tokensAfter,
        offsetGap,
        new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE),
        new SimpleDocMetadataExtractor(),
        new IndexIdDocIdBuilder()
    );
  }

  /**
   * Fully-specified constructor.
   *
   * @param tokensBefore number of context tokens to capture before the target
   * @param tokensAfter number of context tokens to capture after the target
   * @param offsetGap offset gap between values of a multivalued field
   * @param sortKeyBuilder builds each window's sort key
   * @param metadataExtractor extracts document metadata stored with each window
   * @param docIdBuilder builds a unique key for each document
   */
  public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap, SortKeyBuilder sortKeyBuilder,
                       DocMetadataExtractor metadataExtractor, DocIdBuilder docIdBuilder) {
    this.tokensBefore = tokensBefore;
    this.tokensAfter = tokensAfter;
    this.offsetGap = offsetGap;
    this.sortKeyBuilder = sortKeyBuilder;
    this.metadataExtractor = metadataExtractor;
    this.docIdBuilder = docIdBuilder;
  }

  /**
   * Builds a ConcordanceWindow (pre-context, target, post-context strings plus
   * sort key and metadata) from token offsets.
   * <p>
   * Makes the assumption that the target token start and target token end can
   * be found in {@code offsets}.
   *
   * @param uniqueDocID ephemeral internal lucene unique document id
   * @param targetTokenStart Target's start token
   * @param targetTokenEnd Target's end token
   * @param fieldValues field values
   * @param offsets token-to-character offset information for the field
   * @param metadata Metadata to be stored with the window
   * @return ConcordanceWindow
   * @throws TargetTokenNotFoundException if character offset information cannot
   *         be found for both the targetTokenStart and the targetTokenEnd
   * @throws IllegalArgumentException if a target token is negative or the start
   *         token comes after the end token
   */
  public ConcordanceWindow buildConcordanceWindow(String uniqueDocID,
                                                  int targetTokenStart, int targetTokenEnd,
                                                  String[] fieldValues,
                                                  RandomAccessCharOffsetContainer offsets,
                                                  Map<String, String> metadata)
      throws TargetTokenNotFoundException,
      IllegalArgumentException {

    if (targetTokenStart < 0 || targetTokenEnd < 0) {
      throw new IllegalArgumentException(
          "targetTokenStart and targetTokenEnd must be >= 0");
    }
    if (targetTokenEnd < targetTokenStart) {
      throw new IllegalArgumentException(
          "targetTokenEnd must be >= targetTokenStart");
    }

    int targetCharStart = offsets.getCharacterOffsetStart(targetTokenStart);
    int targetCharEnd = offsets.getCharacterOffsetEnd(targetTokenEnd);
    if (targetCharStart < 0 ||
        targetCharEnd < 0) {
      throw new TargetTokenNotFoundException(
          "couldn't find character offsets for a target token.\n"
              + "Check that your analyzers are configured properly.\n");
    }

    // pre-context: null means "no context available" -> empty string
    OffsetAttribute preCharOffset = getPreCharOffset(targetTokenStart,
        targetCharStart, offsets);
    String preString = (preCharOffset == null) ? EMPTY_STRING :
        SimpleAnalyzerUtil.substringFromMultiValuedFields(
            preCharOffset.startOffset(), preCharOffset.endOffset(), fieldValues,
            offsetGap, INTER_MULTIVALUE_FIELD_PADDING);

    // post-context: null means "no context available" -> empty string
    OffsetAttribute postCharOffset = getPostCharOffset(targetTokenEnd,
        targetCharEnd, offsets);
    String postString = (postCharOffset == null) ? EMPTY_STRING :
        SimpleAnalyzerUtil.substringFromMultiValuedFields(
            postCharOffset.startOffset(), postCharOffset.endOffset(), fieldValues,
            offsetGap, INTER_MULTIVALUE_FIELD_PADDING);

    String targString = SimpleAnalyzerUtil.substringFromMultiValuedFields(
        targetCharStart, targetCharEnd, fieldValues,
        offsetGap, INTER_MULTIVALUE_FIELD_PADDING);

    ConcordanceSortKey sortKey = sortKeyBuilder.buildKey(uniqueDocID,
        targetTokenStart, targetTokenEnd, offsets, tokensBefore, tokensAfter, metadata);

    // the window's overall character span, extended to include any context
    int charStart = (preCharOffset == null) ? targetCharStart :
        preCharOffset.startOffset();
    int charEnd = (postCharOffset == null) ? targetCharEnd : postCharOffset.endOffset();

    return new ConcordanceWindow(uniqueDocID, charStart, charEnd, preString, targString,
        postString, sortKey, metadata);
  }

  /**
   * Computes the character span of the pre-target context.
   *
   * @param targetTokenStart token index of the target's first token
   * @param targetCharStart character offset of the target's first token
   * @param charOffsets token-to-character offset information
   * @return character span of the context before the target, or null if there
   *         is no pre-target context
   */
  private OffsetAttribute getPreCharOffset(int targetTokenStart,
                                           int targetCharStart,
                                           RandomAccessCharOffsetContainer charOffsets) {
    if (tokensBefore == 0) {
      return null;
    }
    if (targetTokenStart == 0) {
      return null;
    }
    int contextTokenStart = Math.max(0,
        targetTokenStart - tokensBefore);

    int contextCharStart = charOffsets.getClosestCharStart(contextTokenStart, targetTokenStart);
    //closest start wasn't actually found
    //this can happen if there is a large posInc and the target
    //lands at the start of a field index
    if (contextCharStart < 0) {
      return null;
    }
    // stop one character short of the target itself
    int contextCharEnd = Math.max(contextCharStart, targetCharStart - 1);
    return buildOffsetAttribute(contextCharStart, contextCharEnd);
  }

  /**
   * Computes the character span of the post-target context.
   *
   * @param targetTokenEnd token index of the target's last token
   * @param targetCharEnd character offset of the end of the target's last token
   * @param charOffsets token-to-character offset information
   * @return character span of the context after the target, or null if there
   *         is no post-target context
   */
  private OffsetAttribute getPostCharOffset(int targetTokenEnd,
                                            int targetCharEnd,
                                            RandomAccessCharOffsetContainer charOffsets) {
    if (tokensAfter == 0) {
      return null;
    }
    int contextTokenEnd = targetTokenEnd + tokensAfter;
    int contextCharEnd = charOffsets.getClosestCharEnd(
        contextTokenEnd, targetTokenEnd + 1);
    // no characters available beyond the target itself
    if (targetCharEnd >= contextCharEnd) {
      return null;
    }
    return buildOffsetAttribute(targetCharEnd, contextCharEnd);
  }

  /**
   * Wraps a character span in an {@link OffsetAttribute}.
   *
   * @param start start character offset, inclusive
   * @param end end character offset
   * @return a new OffsetAttribute covering [start, end]
   */
  private OffsetAttribute buildOffsetAttribute(int start, int end) {
    OffsetAttribute off = new OffsetAttributeImpl();
    off.setOffset(start, end);
    return off;
  }

  /**
   * Returns the names of the stored fields this builder needs loaded: the
   * metadata extractor's fields plus, if the doc id builder is field-based,
   * its fields as well.
   *
   * @return set of field names to load for each document
   */
  public Set<String> getFieldSelector() {
    Set<String> set = new HashSet<>(metadataExtractor.getFieldSelector());
    if (docIdBuilder instanceof FieldBasedDocIdBuilder) {
      set.addAll(((FieldBasedDocIdBuilder) docIdBuilder).getFields());
    }
    return set;
  }

  /**
   * Simple wrapper around metadataExtractor
   *
   * @param document document from which to extract metadata
   * @return map
   */
  public Map<String, String> extractMetadata(Document document) {
    return metadataExtractor.extract(document);
  }

  /**
   * Simple wrapper around docIdBuilder.
   *
   * @param document document from which to build the id
   * @param docId ephemeral internal lucene document id
   * @return unique document id
   */
  public String getUniqueDocumentId(Document document, long docId) {
    return docIdBuilder.build(document, docId);
  }

  /**
   * @return number of context tokens captured before the target
   */
  public int getTokensBefore() {
    return tokensBefore;
  }

  /**
   * @return number of context tokens captured after the target
   */
  public int getTokensAfter() {
    return tokensAfter;
  }
}