package org.apache.lucene.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;

// TODO: maybe also toFST?  then we can translate atts into FST outputs/weights

/** Consumes a TokenStream and creates an {@link Automaton}
 *  where the transition labels are UTF8 bytes (or Unicode
 *  code points if unicodeArcs is true) from the {@link
 *  TermToBytesRefAttribute}.  Between tokens we insert
 *  POS_SEP and for holes we insert HOLE.
 *
 * @lucene.experimental */
public class TokenStreamToAutomaton {

  private boolean preservePositionIncrements;
  private boolean unicodeArcs;

  /** Sole constructor. */
  public TokenStreamToAutomaton() {
    this.preservePositionIncrements = true;
  }

  /** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
  public void setPreservePositionIncrements(boolean enablePositionIncrements) {
    this.preservePositionIncrements = enablePositionIncrements;
  }

  /** Whether to make transition labels Unicode code points instead of UTF8 bytes,
   *  <code>false</code> by default */
  public void setUnicodeArcs(boolean unicodeArcs) {
    this.unicodeArcs = unicodeArcs;
  }

  private static class Position implements RollingBuffer.Resettable {
    // Any tokens that ended at our position arrive to this state:
    State arriving;

    // Any tokens that start at our position leave from this state:
    State leaving;

    @Override
    public void reset() {
      arriving = null;
      leaving = null;
    }
  }

  private static class Positions extends RollingBuffer<Position> {
    @Override
    protected Position newInstance() {
      return new Position();
    }
  }

  /** Subclass & implement this if you need to change the
   *  token (such as escaping certain bytes) before it's
   *  turned into a graph. */
  protected BytesRef changeToken(BytesRef in) {
    return in;
  }

  /** We create a transition between two adjacent tokens. */
  public static final int POS_SEP = 0x001f;

  /** We add this arc to represent a hole. */
  public static final int HOLE = 0x001e;

  /** Pulls the graph (including {@link
   *  PositionLengthAttribute}) from the provided {@link
   *  TokenStream}, and creates the corresponding
   *  automaton where arcs are bytes (or Unicode code points
   *  if unicodeArcs = true) from each term.
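   *
   *  <p>A minimal usage sketch (the <code>analyzer</code> variable and the field
   *  name below are illustrative placeholders, not part of this class). Note that
   *  this method calls {@link TokenStream#reset} and {@link TokenStream#end} on the
   *  stream itself, so the caller should only consume the result and close the stream:
   *
   *  <pre class="prettyprint">
   *  TokenStream ts = analyzer.tokenStream("field", new java.io.StringReader("wi fi network"));
   *  try {
   *    Automaton automaton = new TokenStreamToAutomaton().toAutomaton(ts);
   *    // ... use the automaton, e.g. to intersect it with other automata ...
   *  } finally {
   *    ts.close();
   *  }
   *  </pre>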
   */
  public Automaton toAutomaton(TokenStream in) throws IOException {
    final Automaton a = new Automaton();
    boolean deterministic = true;

    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

    final BytesRef term = termBytesAtt.getBytesRef();

    in.reset();

    // Only temporarily holds states ahead of our current
    // position:
    final RollingBuffer<Position> positions = new Positions();

    int pos = -1;
    Position posData = null;
    int maxOffset = 0;
    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      if (!preservePositionIncrements && posInc > 1) {
        posInc = 1;
      }
      assert pos > -1 || posInc > 0;

      if (posInc > 0) {

        // New node:
        pos += posInc;

        posData = positions.get(pos);
        assert posData.leaving == null;

        if (posData.arriving == null) {
          // No token ever arrived to this position
          if (pos == 0) {
            // OK: this is the first token
            posData.leaving = a.getInitialState();
          } else {
            // This means there's a hole (eg, StopFilter
            // does this):
            posData.leaving = new State();
            addHoles(a.getInitialState(), positions, pos);
          }
        } else {
          posData.leaving = new State();
          posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
          if (posInc > 1) {
            // A token spanned over a hole; add holes
            // "under" it:
            addHoles(a.getInitialState(), positions, pos);
          }
        }
        positions.freeBefore(pos);
      } else {
        // note: this isn't necessarily true; it's just that we can no longer be sure
        // the result is deterministic. We could optimize this further (e.g. buffer and
        // sort synonyms at a position), but that's probably overkill. This is cheap and dirty:
        deterministic = false;
      }

      final int endPos = pos + posLengthAtt.getPositionLength();

      termBytesAtt.fillBytesRef();
      final BytesRef termUTF8 = changeToken(term);
      int[] termUnicode = null;
      final Position endPosData = positions.get(endPos);
      if (endPosData.arriving == null) {
        endPosData.arriving = new State();
      }

      State state = posData.leaving;
      int termLen;
      if (unicodeArcs) {
        final String utf16 = termUTF8.utf8ToString();
        termUnicode = new int[utf16.codePointCount(0, utf16.length())];
        termLen = termUnicode.length;
        for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
          termUnicode[j++] = cp = utf16.codePointAt(i);
        }
      } else {
        termLen = termUTF8.length;
      }

      for (int byteIDX = 0; byteIDX < termLen; byteIDX++) {
        final State nextState = byteIDX == termLen - 1 ? endPosData.arriving : new State();
        int c;
        if (unicodeArcs) {
          c = termUnicode[byteIDX];
        } else {
          c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
        }
        state.addTransition(new Transition(c, nextState));
        state = nextState;
      }

      maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
    }

    in.end();
    State endState = null;
    if (offsetAtt.endOffset() > maxOffset) {
      endState = new State();
      endState.setAccept(true);
    }

    pos++;
    while (pos <= positions.getMaxPos()) {
      posData = positions.get(pos);
      if (posData.arriving != null) {
        if (endState != null) {
          posData.arriving.addTransition(new Transition(POS_SEP, endState));
        } else {
          posData.arriving.setAccept(true);
        }
      }
      pos++;
    }

    //toDot(a);
    a.setDeterministic(deterministic);
    return a;
  }

  // for debugging!
  /*
  private static void toDot(Automaton a) throws IOException {
    final String s = a.toDot();
    Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot"));
    w.write(s);
    w.close();
    System.out.println("TEST: saved to /tmp/out.dot");
  }
  */

  private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
    Position posData = positions.get(pos);
    Position prevPosData = positions.get(pos-1);

    while (posData.arriving == null || prevPosData.leaving == null) {
      if (posData.arriving == null) {
        posData.arriving = new State();
        posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
      }
      if (prevPosData.leaving == null) {
        if (pos == 1) {
          prevPosData.leaving = startState;
        } else {
          prevPosData.leaving = new State();
        }
        if (prevPosData.arriving != null) {
          prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
        }
      }
      prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
      pos--;
      if (pos <= 0) {
        break;
      }
      posData = prevPosData;
      prevPosData = positions.get(pos-1);
    }
  }
}