/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import java.util.Arrays; import java.util.Comparator; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; /** * Filter outputs a single token which is a concatenation of the sorted and * de-duplicated set of input tokens. This can be useful for clustering/linking * use cases. */ public class FingerprintFilter extends TokenFilter { public static final int DEFAULT_MAX_OUTPUT_TOKEN_SIZE = 1024; public static final char DEFAULT_SEPARATOR = ' '; private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); private CharArraySet uniqueTerms = null; private final int maxOutputTokenSize; private AttributeSource.State finalState; private final char separator; private boolean inputEnded = false; /** * Create a new FingerprintFilter with default settings */ public FingerprintFilter(TokenStream input) { this(input, DEFAULT_MAX_OUTPUT_TOKEN_SIZE, DEFAULT_SEPARATOR); } /** * Create a new FingerprintFilter with control over all settings * * @param input * the source of tokens to be summarized into a single token * @param maxOutputTokenSize * the maximum length of the summarized output token. If exceeded, no * output token is emitted * @param separator * the character used to separate tokens combined into the single * output token */ public FingerprintFilter(TokenStream input, int maxOutputTokenSize, char separator) { super(input); this.maxOutputTokenSize = maxOutputTokenSize; this.separator = separator; } @Override public final boolean incrementToken() throws IOException { if (uniqueTerms != null) { // We have already built the single output token - there's no more return false; } boolean result = buildSingleOutputToken(); finalState = captureState(); return result; } /** * Gathers all tokens from input, de-duplicates, sorts then concatenates. * * @return false for end of stream; true otherwise */ private final boolean buildSingleOutputToken() throws IOException { inputEnded = false; char clonedLastTerm[] = null; uniqueTerms = new CharArraySet(8, false); int outputTokenSize = 0; while (input.incrementToken()) { if (outputTokenSize > maxOutputTokenSize) { continue; } final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); if (!uniqueTerms.contains(term, 0, length)) { // clone the term, and add to the set of seen terms. clonedLastTerm = new char[length]; System.arraycopy(term, 0, clonedLastTerm, 0, length); if (uniqueTerms.size() > 0) { outputTokenSize++; //Add 1 for the separator char we will output } uniqueTerms.add(clonedLastTerm); outputTokenSize += length; } } //Force end-of-stream operations to get the final state. input.end(); inputEnded = true; //Gathering complete - now output exactly zero or one token: //Set the attributes for the single output token offsetAtt.setOffset(0, offsetAtt.endOffset()); posLenAtt.setPositionLength(1); posIncrAtt.setPositionIncrement(1); typeAtt.setType("fingerprint"); //No tokens gathered - no output if (uniqueTerms.size() < 1) { termAttribute.setEmpty(); return false; } //Tokens gathered are too large - no output if (outputTokenSize > maxOutputTokenSize) { termAttribute.setEmpty(); uniqueTerms.clear(); return false; } // Special case - faster option when we have a single token if (uniqueTerms.size() == 1) { termAttribute.setEmpty().append(new String(clonedLastTerm)); uniqueTerms.clear(); return true; } // Sort the set of deduplicated tokens and combine Object[] items = uniqueTerms.toArray(); Arrays.sort(items, new Comparator<Object>() { @Override public int compare(Object o1, Object o2) { char v1[] = (char[]) o1; char v2[] = (char[]) o2; int len1 = v1.length; int len2 = v2.length; int lim = Math.min(len1, len2); int k = 0; while (k < lim) { char c1 = v1[k]; char c2 = v2[k]; if (c1 != c2) { return c1 - c2; } k++; } return len1 - len2; } }); StringBuilder sb = new StringBuilder(); for (Object item : items) { if (sb.length() >= 1) { sb.append(separator); } sb.append((char[]) item); } termAttribute.setEmpty().append(sb); uniqueTerms.clear(); return true; } @Override public final void end() throws IOException { if (!inputEnded) { // Rare case - If an IOException occurs while performing buildSingleOutputToken // we may not have called input.end() already input.end(); inputEnded = true; } if (finalState != null) { restoreState(finalState); } } /** * {@inheritDoc} */ @Override public void reset() throws IOException { super.reset(); inputEnded = false; uniqueTerms = null; } }