/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.sinks;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeSource;

/**
 * This TokenFilter provides the ability to set aside attribute states that have already been
 * analyzed. This is useful in situations where multiple fields share many common analysis steps
 * and then go their separate ways.
 *
 * <p>
 * It is also useful for doing things like entity extraction or proper noun analysis as part of the
 * analysis workflow and saving off those tokens for use in another field.
 * </p>
 *
 * <pre class="prettyprint">
 * TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer());
 * TokenStream sink1 = source1.newSinkTokenStream();
 * TokenStream sink2 = source1.newSinkTokenStream();
 *
 * TokenStream final1 = new LowerCaseFilter(source1);
 * TokenStream final2 = new EntityDetect(sink1);
 * TokenStream final3 = new URLDetect(sink2);
 *
 * d.add(new TextField("f1", final1));
 * d.add(new TextField("f2", final2));
 * d.add(new TextField("f3", final3));
 * </pre>
 *
 * <p>
 * In this example, {@code sink1} and {@code sink2} will both get tokens from {@code source1} after
 * whitespace tokenization, and will further do additional token filtering, e.g. detect entities
 * and URLs.
 * </p>
 *
 * <p>
 * <b>NOTE</b>: it is important that tees are consumed before sinks; therefore you should add them
 * to the document before the sinks. In the above example, <i>f1</i> is added before the other
 * fields, and so by the time they are processed, it has already been consumed, which is the
 * correct way to index the three streams. If for some reason you cannot ensure that, you should
 * call {@link #consumeAllTokens()} before adding the sinks to document fields.
 */
public final class TeeSinkTokenFilter extends TokenFilter {
  private final States cachedStates = new States();

  public TeeSinkTokenFilter(TokenStream input) {
    super(input);
  }

  /** Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream. */
  public TokenStream newSinkTokenStream() {
    return new SinkTokenStream(this.cloneAttributes(), cachedStates);
  }

  /**
   * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks when it is itself
   * consumed. To be sure that all tokens from the input stream are passed to the sinks, you can
   * call this method. This instance is exhausted after this method returns, but all sinks are
   * instantly available.
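   *
   * <p>
   * A minimal sketch of that fallback, reusing the illustrative names from the class example above
   * ({@code source1}, {@code final2}, {@code final3}, {@code d}); here the tee is drained manually
   * rather than indexed as a field, so the order in which the sink-backed fields are added no
   * longer matters:
   * </p>
   *
   * <pre class="prettyprint">
   * source1.reset();             // standard TokenStream consumer workflow: reset before consuming
   * source1.consumeAllTokens();  // caches every token state; the sinks can now replay them
   * source1.end();               // records the end-of-stream state for the sinks' end()
   *
   * d.add(new TextField("f2", final2));
   * d.add(new TextField("f3", final3));
   * </pre>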
   */
  public void consumeAllTokens() throws IOException {
    while (incrementToken()) {}
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      // Remember the full attribute state of this token so every sink can replay it later.
      cachedStates.add(captureState());
      return true;
    }

    return false;
  }

  @Override
  public final void end() throws IOException {
    super.end();
    cachedStates.setFinalState(captureState());
  }

  @Override
  public void reset() throws IOException {
    cachedStates.reset();
    super.reset();
  }

  /** TokenStream output from a tee. */
  public static final class SinkTokenStream extends TokenStream {
    private final States cachedStates;
    private Iterator<AttributeSource.State> it = null;

    private SinkTokenStream(AttributeSource source, States cachedStates) {
      super(source);
      this.cachedStates = cachedStates;
    }

    @Override
    public final boolean incrementToken() {
      if (!it.hasNext()) {
        return false;
      }

      AttributeSource.State state = it.next();
      restoreState(state);
      return true;
    }

    @Override
    public void end() throws IOException {
      State finalState = cachedStates.getFinalState();
      if (finalState != null) {
        restoreState(finalState);
      }
    }

    @Override
    public final void reset() {
      it = cachedStates.getStates();
    }
  }

  /**
   * A convenience wrapper for storing the cached states as well as the final state of the stream.
   */
  private static final class States {
    private final List<State> states = new ArrayList<>();
    private State finalState;

    public States() {}

    void setFinalState(State finalState) {
      this.finalState = finalState;
    }

    State getFinalState() {
      return finalState;
    }

    void add(State state) {
      states.add(state);
    }

    Iterator<State> getStates() {
      return states.iterator();
    }

    void reset() {
      finalState = null;
      states.clear();
    }
  }
}