package org.apache.lucene.search.concordance.charoffsets;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * Static helpers for running an {@link Analyzer} over a string and for
 * extracting character-offset based substrings from multi-valued fields.
 */
public class SimpleAnalyzerUtil {

  /**
   * Analyzes {@code s} and returns the resulting terms in a new list.
   *
   * @param s        string to analyze
   * @param field    field name handed to the analyzer
   * @param analyzer analyzer to use
   * @return list of analyzed terms, in the order the analyzer emitted them
   * @throws IOException if there's an IOException during analysis
   */
  public static List<String> getTermStrings(String s, String field, Analyzer analyzer)
      throws IOException {
    return getTermStrings(s, field, analyzer, new ArrayList<String>());
  }

  /**
   * Analyzes {@code s} and collects the terms into {@code terms}, which allows
   * the caller to reuse a list. This method calls {@code terms.clear()} before
   * adding new terms; if {@code terms} is null a new list is created.
   *
   * @param s        string to analyze
   * @param field    field name handed to the analyzer
   * @param analyzer analyzer to use
   * @param terms    list for reuse, may be null
   * @return the list holding the analyzed terms
   * @throws java.io.IOException if there's an IOException during analysis
   */
  private static List<String> getTermStrings(String s, String field, Analyzer analyzer,
                                             List<String> terms) throws IOException {
    if (terms == null) {
      terms = new ArrayList<>();
    }
    terms.clear();
    // try-with-resources guarantees close() even when reset(), incrementToken()
    // or end() throws; without it a failed analysis leaves the stream unclosed.
    try (TokenStream stream = analyzer.tokenStream(field, s)) {
      // addAttribute returns the existing attribute instance or registers one,
      // so termAtt is always usable.
      CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        terms.add(termAtt.toString());
      }
      stream.end();
    }
    return terms;
  }

  /**
   * Calculates a substring from the values of a multi-valued field, where the
   * character offsets run across all values and consecutive values are
   * separated by {@code offsetGap} characters.
   * <p>
   * This attempts to do the best job possible, and at worst will
   * return an empty string. Negative offsets are treated as 0 and a
   * {@code start} greater than {@code end} is pulled back to {@code end}.
   * If {@code start} lies beyond the last value, the result is empty. If
   * {@code start} falls in the gap before a value, the result begins with
   * {@code interFieldJoiner}. If {@code end} falls in a gap or beyond the
   * last value, the result stops at the end of the last value reached.
   *
   * @param start            character offset to start
   * @param end              character offset to end
   * @param fieldValues      array of Strings to process; null or empty yields ""
   * @param offsetGap        offsetGap as typically returned by Analyzer's .getOffsetGap()
   * @param interFieldJoiner string to use to mark that a substring goes beyond a single
   *                         field entry; null is treated as the empty string
   * @return substring, potentially empty, never null.
   */
  public static String substringFromMultiValuedFields(int start,
      int end, String[] fieldValues, int offsetGap, String interFieldJoiner) {
    if (fieldValues == null || fieldValues.length == 0) {
      return "";
    }
    // StringBuilder.append((String) null) would emit the literal text "null".
    String joiner = (interFieldJoiner == null) ? "" : interFieldJoiner;

    end = Math.max(end, 0);
    start = Math.min(Math.max(start, 0), end);

    // Locate the first value whose end lies beyond start. charBase is the
    // global character offset at which the current value begins.
    int charBase = 0;
    int startFieldIndex = -1;
    int localStart = 0;
    for (int fieldIndex = 0; fieldIndex < fieldValues.length; fieldIndex++) {
      int fieldLength = fieldValues[fieldIndex].length();
      if (start < charBase + fieldLength) {
        localStart = start - charBase;
        startFieldIndex = fieldIndex;
        break;
      }
      charBase += fieldLength + offsetGap;
    }
    if (startFieldIndex < 0) {
      // start is at or past the end of the last value
      return "";
    }

    StringBuilder sb = new StringBuilder();
    // A negative local offset means start fell in the gap preceding this
    // value: mark the boundary and begin at the value's first character.
    if (localStart < 0) {
      sb.append(joiner);
      localStart = 0;
    }

    // Append values (or parts of them) until the one containing end is reached.
    for (int fieldIndex = startFieldIndex; fieldIndex < fieldValues.length; fieldIndex++) {
      String fString = fieldValues[fieldIndex];
      if (end <= charBase + fString.length()) {
        if (charBase > end) {
          // end lies in the gap before this value; nothing more to add
          return sb.toString();
        }
        if (fieldIndex != startFieldIndex) {
          sb.append(joiner);
        }
        sb.append(fString.substring(localStart, end - charBase));
        break;
      }
      if (fieldIndex != startFieldIndex) {
        sb.append(joiner);
      }
      sb.append(fString.substring(localStart));
      // every value after the first is consumed from its beginning
      localStart = 0;
      charBase += fString.length() + offsetGap;
    }
    return sb.toString();
  }
}