/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.fst.Util;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

public class TestGraphTokenizers extends BaseTokenStreamTestCase {

  // Makes a graph TokenStream from the string; separate
  // positions with single space, multiple tokens at the same
  // position with /, and add optional position length with
  // :.  EG "a b c" is a simple chain, "a/x b c" adds 'x'
  // over 'a' at position 0 with posLen=1, "a/x:3 b c" adds
  // 'x' over a with posLen=3.  Tokens are in normal-form!
  // So, offsets are computed based on the first token at a
  // given position.  NOTE: each token must be a single
  // character!  We assume this when computing offsets...

  // NOTE: all input tokens must be length 1!!!  This means
  // you cannot turn on MockCharFilter when random
  // testing...
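
  // A worked example of the conventions above, derived from the
  // fillTokens() offset logic below (each position spans two
  // characters): for the input "a/x:3 b c",
  //   'a' -> posInc=1, posLen=1, startOffset=0, endOffset=1
  //   'x' -> posInc=0, posLen=3, startOffset=0, endOffset=5 (spans positions 0-2)
  //   'b' -> posInc=1, posLen=1, startOffset=2, endOffset=3
  //   'c' -> posInc=1, posLen=1, startOffset=4, endOffset=5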
  private static class GraphTokenizer extends Tokenizer {
    private List<Token> tokens;
    private int upto;
    private int inputLength;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);

    @Override
    public void reset() throws IOException {
      super.reset();
      tokens = null;
      upto = 0;
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (tokens == null) {
        fillTokens();
      }
      //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
      if (upto == tokens.size()) {
        //System.out.println("  END @ " + tokens.size());
        return false;
      }
      final Token t = tokens.get(upto++);
      //System.out.println("  return token=" + t);
      clearAttributes();
      termAtt.append(t.toString());
      offsetAtt.setOffset(t.startOffset(), t.endOffset());
      posIncrAtt.setPositionIncrement(t.getPositionIncrement());
      posLengthAtt.setPositionLength(t.getPositionLength());
      return true;
    }

    @Override
    public void end() throws IOException {
      super.end();
      // NOTE: somewhat... hackish, but we need this to
      // satisfy BTSTC:
      final int lastOffset;
      if (tokens != null && !tokens.isEmpty()) {
        lastOffset = tokens.get(tokens.size()-1).endOffset();
      } else {
        lastOffset = 0;
      }
      offsetAtt.setOffset(correctOffset(lastOffset), correctOffset(inputLength));
    }

    private void fillTokens() throws IOException {
      final StringBuilder sb = new StringBuilder();
      final char[] buffer = new char[256];
      while (true) {
        final int count = input.read(buffer);
        if (count == -1) {
          break;
        }
        sb.append(buffer, 0, count);
        //System.out.println("got count=" + count);
      }
      //System.out.println("fillTokens: " + sb);

      inputLength = sb.length();

      final String[] parts = sb.toString().split(" ");

      tokens = new ArrayList<>();
      int pos = 0;
      int maxPos = -1;
      int offset = 0;
      //System.out.println("again");
      for(String part : parts) {
        final String[] overlapped = part.split("/");
        boolean firstAtPos = true;
        int minPosLength = Integer.MAX_VALUE;
        for(String part2 : overlapped) {
          final int colonIndex = part2.indexOf(':');
          final String token;
          final int posLength;
          if (colonIndex != -1) {
            token = part2.substring(0, colonIndex);
            posLength = Integer.parseInt(part2.substring(1+colonIndex));
          } else {
            token = part2;
            posLength = 1;
          }
          maxPos = Math.max(maxPos, pos + posLength);
          minPosLength = Math.min(minPosLength, posLength);
          final Token t = new Token(token, offset, offset + 2*posLength - 1);
          t.setPositionLength(posLength);
          t.setPositionIncrement(firstAtPos ? 1 : 0);
          firstAtPos = false;
          //System.out.println("  add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
          tokens.add(t);
        }
        pos += minPosLength;
        offset = 2 * pos;
      }
      assert maxPos <= pos: "input string mal-formed: posLength>1 tokens hang over the end";
    }
  }

  public void testMockGraphTokenFilterBasic() throws Exception {
    for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t);
          return new TokenStreamComponents(t, t2);
        }
      };

      checkAnalysisConsistency(random(), a, false, "a b c d e f g h i j k");
    }
  }

  public void testMockGraphTokenFilterOnGraphInput() throws Exception {
    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new GraphTokenizer();
          final TokenStream t2 = new MockGraphTokenFilter(random(), t);
          return new TokenStreamComponents(t, t2);
        }
      };

      checkAnalysisConsistency(random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
    }
  }

  // Just deletes (leaving hole) token 'a':
  private final static class RemoveATokens extends TokenFilter {
    private int pendingPosInc;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    public RemoveATokens(TokenStream in) {
      super(in);
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      pendingPosInc = 0;
    }

    @Override
    public void end() throws IOException {
      super.end();
      posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
    }

    @Override
    public boolean incrementToken() throws IOException {
      while (true) {
        final boolean gotOne = input.incrementToken();
        if (!gotOne) {
          return false;
        } else if (termAtt.toString().equals("a")) {
          pendingPosInc += posIncAtt.getPositionIncrement();
        } else {
          posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
          pendingPosInc = 0;
          return true;
        }
      }
    }
  }

  public void testMockGraphTokenFilterBeforeHoles() throws Exception {
    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t);
          final TokenStream t3 = new RemoveATokens(t2);
          return new TokenStreamComponents(t, t3);
        }
      };

      Random random = random();
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
    }
  }

  public void testMockGraphTokenFilterAfterHoles() throws Exception {
    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t2 = new RemoveATokens(t);
          final TokenStream t3 = new MockGraphTokenFilter(random(), t2);
          return new TokenStreamComponents(t, t3);
        }
      };

      Random random = random();
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
    }
  }

  public void testMockGraphTokenFilterRandom() throws Exception {
    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t);
          return new TokenStreamComponents(t, t2);
        }
      };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  // Two MockGraphTokenFilters
  public void testDoubleMockGraphTokenFilterRandom() throws Exception {
    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t1 = new MockGraphTokenFilter(random(), t);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
          return new TokenStreamComponents(t, t2);
        }
      };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception {
    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t1 = new MockGraphTokenFilter(random(), t);
          final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1);
          return new TokenStreamComponents(t, t2);
        }
      };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  public void testMockGraphTokenFilterAfterHolesRandom() throws Exception {
    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
          return new TokenStreamComponents(t, t2);
        }
      };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  private static Token token(String term, int posInc, int posLength) {
    final Token t = new Token(term, 0, 0);
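    // offsets are left at 0/0 here; the language-equality
    // assertions below only compare terms, position increments
    // and position lengths, not offsets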
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
  }

  private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
    final Token t = new Token(term, startOffset, endOffset);
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
  }

  public void testSingleToken() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
      });
    assertSameLanguage(s2a("abc"), ts);
  }

  public void testMultipleHoles() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("b", 3, 1),
      });
    assertSameLanguage(join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")), ts);
  }

  public void testSynOverMultipleHoles() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("x", 0, 3),
        token("b", 3, 1),
      });
    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

  // for debugging!
  /*
  private static void toDot(Automaton a) throws IOException {
    final String s = a.toDot();
    Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
    w.write(s);
    w.close();
    System.out.println("TEST: saved to /x/tmp/out.dot");
  }
  */

  private static final Automaton SEP_A = Automata.makeChar(TokenStreamToAutomaton.POS_SEP);
  private static final Automaton HOLE_A = Automata.makeChar(TokenStreamToAutomaton.HOLE);

  private Automaton join(String ... strings) {
    List<Automaton> as = new ArrayList<>();
    for(String s : strings) {
      as.add(s2a(s));
      as.add(SEP_A);
    }
    as.remove(as.size()-1);
    return Operations.concatenate(as);
  }

  private Automaton join(Automaton ... as) {
    return Operations.concatenate(Arrays.asList(as));
  }

  private Automaton s2a(String s) {
    return Automata.makeString(s);
  }

  public void testTwoTokens() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 1, 1),
      });
    assertSameLanguage(join("abc", "def"), ts);
  }

  public void testHole() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 2, 1),
      });
    assertSameLanguage(join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")), ts);
  }

  public void testOverlappedTokensSausage() throws Exception {
    // Two tokens on top of each other (sausage):
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 1)
      });
    final Automaton a1 = s2a("abc");
    final Automaton a2 = s2a("xyz");
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

  public void testOverlappedTokensLattice() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 2),
        token("def", 1, 1),
      });
    final Automaton a1 = s2a("xyz");
    final Automaton a2 = join("abc", "def");
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

  public void testSynOverHole() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("X", 0, 2),
        token("b", 2, 1),
      });
    final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
    final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
    assertSameLanguage(expected, ts);
  }

  public void testSynOverHole2() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("xyz", 1, 1),
        token("abc", 0, 3),
        token("def", 2, 1),
      });
    final Automaton expected = Operations.union(
      join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
    assertSameLanguage(expected, ts);
  }

  public void testOverlappedTokensLattice2() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 3),
        token("def", 1, 1),
        token("ghi", 1, 1),
      });
    final Automaton a1 = s2a("xyz");
    final Automaton a2 = join("abc", "def", "ghi");
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

  public void testToDot() throws Exception {
    final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
    StringWriter w = new StringWriter();
    new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot();
    assertTrue(w.toString().indexOf("abc / abcd") != -1);
  }

  public void testStartsWithHole() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 2, 1),
      });
    assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
  }

  public void testEndsWithHole() throws Exception {
    final TokenStream ts = new CannedTokenStream(1, 0,
      new Token[] {
        token("abc", 2, 1),
      });
    assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc"), SEP_A, HOLE_A), ts);
  }

  public void testSynHangingOverEnd() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("X", 0, 10),
      });
    assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
  }

  /** Returns all paths */
  private Set<String> toPathStrings(Automaton a) {
    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
    Set<String> paths = new HashSet<>();
    for (IntsRef ir: AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
      paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    }
    return paths;
  }

  private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
    assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
  }

  private void assertSameLanguage(Automaton expected, Automaton actual) {
    Automaton expectedDet = Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES);
    Automaton actualDet = Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES);
    if (Operations.sameLanguage(expectedDet, actualDet) == false) {
      Set<String> expectedPaths = toPathStrings(expectedDet);
      Set<String> actualPaths = toPathStrings(actualDet);
      StringBuilder b = new StringBuilder();
      b.append("expected:\n");
      for(String path : expectedPaths) {
        b.append("  ");
        b.append(path);
        if (actualPaths.contains(path) == false) {
          b.append(" [missing!]");
        }
        b.append('\n');
      }
      b.append("actual:\n");
      for(String path : actualPaths) {
        b.append("  ");
        b.append(path);
        if (expectedPaths.contains(path) == false) {
          b.append(" [unexpected!]");
        }
        b.append('\n');
      }
      fail("accepted language is different:\n\n" + b.toString());
    }
  }

  public void testTokenStreamGraphWithHoles() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 1, 8),
        token("def", 1, 1),
        token("ghi", 1, 1),
      });
    assertSameLanguage(Operations.union(join(s2a("abc"), SEP_A, s2a("xyz")),
                                        join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"))), ts);
  }
}