/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.fst.Util;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

public class TestGraphTokenizers extends BaseTokenStreamTestCase {

  // Makes a graph TokenStream from the string; separate
  // positions with single space, multiple tokens at the same
  // position with /, and add optional position length with
  // :.  EG "a b c" is a simple chain, "a/x b c" adds 'x'
  // over 'a' at position 0 with posLen=1, "a/x:3 b c" adds
  // 'x' over a with posLen=3.  Tokens are in normal-form!
  // So, offsets are computed based on the first token at a
  // given position.  NOTE: each token must be a single
  // character!  We assume this when computing offsets...

  // NOTE: all input tokens must be length 1!!!  This means
  // you cannot turn on MockCharFilter when random
  // testing...
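
  // A worked example of the conventions above, derived from the
  // fillTokens() offset logic below (each position spans two
  // characters): for the input "a/x:3 b c",
  //   'a' -> posInc=1, posLen=1, startOffset=0, endOffset=1
  //   'x' -> posInc=0, posLen=3, startOffset=0, endOffset=5 (spans positions 0-2)
  //   'b' -> posInc=1, posLen=1, startOffset=2, endOffset=3
  //   'c' -> posInc=1, posLen=1, startOffset=4, endOffset=5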
  private static class GraphTokenizer extends Tokenizer {
    private List<Token> tokens;
    private int upto;
    private int inputLength;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);

    @Override
    public void reset() throws IOException {
      super.reset();
      tokens = null;
      upto = 0;
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (tokens == null) {
        fillTokens();
      }
      //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
      if (upto == tokens.size()) {
        //System.out.println("  END @ " + tokens.size());
        return false;
      }
      final Token t = tokens.get(upto++);
      //System.out.println("  return token=" + t);
      clearAttributes();
      termAtt.append(t.toString());
      offsetAtt.setOffset(t.startOffset(), t.endOffset());
      posIncrAtt.setPositionIncrement(t.getPositionIncrement());
      posLengthAtt.setPositionLength(t.getPositionLength());
      return true;
    }

    @Override
    public void end() throws IOException {
      super.end();
      // NOTE: somewhat... hackish, but we need this to
      // satisfy BTSTC:
      final int lastOffset;
      if (tokens != null && !tokens.isEmpty()) {
        lastOffset = tokens.get(tokens.size()-1).endOffset();
      } else {
        lastOffset = 0;
      }
      offsetAtt.setOffset(correctOffset(lastOffset), correctOffset(inputLength));
    }

    private void fillTokens() throws IOException {
      final StringBuilder sb = new StringBuilder();
      final char[] buffer = new char[256];
      while (true) {
        final int count = input.read(buffer);
        if (count == -1) {
          break;
        }
        sb.append(buffer, 0, count);
        //System.out.println("got count=" + count);
      }
      //System.out.println("fillTokens: " + sb);

      inputLength = sb.length();

      final String[] parts = sb.toString().split(" ");

      tokens = new ArrayList<>();
      int pos = 0;
      int maxPos = -1;
      int offset = 0;
      //System.out.println("again");
      for(String part : parts) {
        final String[] overlapped = part.split("/");
        boolean firstAtPos = true;
        int minPosLength = Integer.MAX_VALUE;
        for(String part2 : overlapped) {
          final int colonIndex = part2.indexOf(':');
          final String token;
          final int posLength;
          if (colonIndex != -1) {
            token = part2.substring(0, colonIndex);
            posLength = Integer.parseInt(part2.substring(1+colonIndex));
          } else {
            token = part2;
            posLength = 1;
          }
          maxPos = Math.max(maxPos, pos + posLength);
          minPosLength = Math.min(minPosLength, posLength);
          final Token t = new Token(token, offset, offset + 2*posLength - 1);
          t.setPositionLength(posLength);
          t.setPositionIncrement(firstAtPos ? 1 : 0);
          firstAtPos = false;
          //System.out.println("  add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
          tokens.add(t);
        }
        pos += minPosLength;
        offset = 2 * pos;
      }
      assert maxPos <= pos: "input string mal-formed: posLength>1 tokens hang over the end";
    }
  }

  public void testMockGraphTokenFilterBasic() throws Exception {
    for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t);
          return new TokenStreamComponents(t, t2);
        }
      };

      checkAnalysisConsistency(random(), a, false, "a b c d e f g h i j k");
    }
  }

  public void testMockGraphTokenFilterOnGraphInput() throws Exception {
    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new GraphTokenizer();
          final TokenStream t2 = new MockGraphTokenFilter(random(), t);
          return new TokenStreamComponents(t, t2);
        }
      };

      checkAnalysisConsistency(random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
    }
  }

  // Just deletes (leaving hole) token 'a':
  private final static class RemoveATokens extends TokenFilter {
    private int pendingPosInc;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    public RemoveATokens(TokenStream in) {
      super(in);
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      pendingPosInc = 0;
    }

    @Override
    public void end() throws IOException {
      super.end();
      posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
    }

    @Override
    public boolean incrementToken() throws IOException {
      while (true) {
        final boolean gotOne = input.incrementToken();
        if (!gotOne) {
          return false;
        } else if (termAtt.toString().equals("a")) {
          pendingPosInc += posIncAtt.getPositionIncrement();
        } else {
          posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
          pendingPosInc = 0;
          return true;
        }
      }
    }
  }

  public void testMockGraphTokenFilterBeforeHoles() throws Exception {
    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t);
          final TokenStream t3 = new RemoveATokens(t2);
          return new TokenStreamComponents(t, t3);
        }
      };

      Random random = random();
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
    }
  }

  public void testMockGraphTokenFilterAfterHoles() throws Exception {
    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t2 = new RemoveATokens(t);
          final TokenStream t3 = new MockGraphTokenFilter(random(), t2);
          return new TokenStreamComponents(t, t3);
        }
      };

      Random random = random();
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
    }
  }

  public void testMockGraphTokenFilterRandom() throws Exception {
    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t);
          return new TokenStreamComponents(t, t2);
        }
      };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  // Two MockGraphTokenFilters
  public void testDoubleMockGraphTokenFilterRandom() throws Exception {
    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t1 = new MockGraphTokenFilter(random(), t);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
          return new TokenStreamComponents(t, t2);
        }
      };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception {
    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t1 = new MockGraphTokenFilter(random(), t);
          final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1);
          return new TokenStreamComponents(t, t2);
        }
      };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  public void testMockGraphTokenFilterAfterHolesRandom() throws Exception {
    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t);
          final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
          return new TokenStreamComponents(t, t2);
        }
      };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  private static Token token(String term, int posInc, int posLength) {
    final Token t = new Token(term, 0, 0);
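    // offsets are left at 0/0 here; the language-equality
    // assertions below only compare terms, position increments
    // and position lengths, not offsets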
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
  }

  private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
    final Token t = new Token(term, startOffset, endOffset);
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
  }

  public void testSingleToken() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
      });
    assertSameLanguage(s2a("abc"), ts);
  }

  public void testMultipleHoles() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("b", 3, 1),
      });
    assertSameLanguage(join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")), ts);
  }

  public void testSynOverMultipleHoles() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("x", 0, 3),
        token("b", 3, 1),
      });
    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

  // for debugging!
  /*
  private static void toDot(Automaton a) throws IOException {
    final String s = a.toDot();
    Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
    w.write(s);
    w.close();
    System.out.println("TEST: saved to /x/tmp/out.dot");
  }
  */

  private static final Automaton SEP_A = Automata.makeChar(TokenStreamToAutomaton.POS_SEP);
  private static final Automaton HOLE_A = Automata.makeChar(TokenStreamToAutomaton.HOLE);

  private Automaton join(String ... strings) {
    List<Automaton> as = new ArrayList<>();
    for(String s : strings) {
      as.add(s2a(s));
      as.add(SEP_A);
    }
    as.remove(as.size()-1);
    return Operations.concatenate(as);
  }

  private Automaton join(Automaton ... as) {
    return Operations.concatenate(Arrays.asList(as));
  }

  private Automaton s2a(String s) {
    return Automata.makeString(s);
  }

  public void testTwoTokens() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 1, 1),
      });
    assertSameLanguage(join("abc", "def"), ts);
  }

  public void testHole() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 2, 1),
      });
    assertSameLanguage(join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")), ts);
  }

  public void testOverlappedTokensSausage() throws Exception {
    // Two tokens on top of each other (sausage):
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 1)
      });
    final Automaton a1 = s2a("abc");
    final Automaton a2 = s2a("xyz");
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

  public void testOverlappedTokensLattice() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 2),
        token("def", 1, 1),
      });
    final Automaton a1 = s2a("xyz");
    final Automaton a2 = join("abc", "def");
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

  public void testSynOverHole() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("X", 0, 2),
        token("b", 2, 1),
      });
    final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
    final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
    assertSameLanguage(expected, ts);
  }

  public void testSynOverHole2() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("xyz", 1, 1),
        token("abc", 0, 3),
        token("def", 2, 1),
      });
    final Automaton expected = Operations.union(
      join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
    assertSameLanguage(expected, ts);
  }

  public void testOverlappedTokensLattice2() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 3),
        token("def", 1, 1),
        token("ghi", 1, 1),
      });
    final Automaton a1 = s2a("xyz");
    final Automaton a2 = join("abc", "def", "ghi");
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

  public void testToDot() throws Exception {
    final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
    StringWriter w = new StringWriter();
    new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot();
    assertTrue(w.toString().indexOf("abc / abcd") != -1);
  }

  public void testStartsWithHole() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 2, 1),
      });
    assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
  }

  public void testEndsWithHole() throws Exception {
    final TokenStream ts = new CannedTokenStream(1, 0,
      new Token[] {
        token("abc", 2, 1),
      });
    assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc"), SEP_A, HOLE_A), ts);
  }

  public void testSynHangingOverEnd() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("X", 0, 10),
      });
    assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
  }

  /** Returns all paths */
  private Set<String> toPathStrings(Automaton a) {
    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
    Set<String> paths = new HashSet<>();
    for (IntsRef ir: AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
      paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    }
    return paths;
  }

  private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
    assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
  }

  private void assertSameLanguage(Automaton expected, Automaton actual) {
    Automaton expectedDet = Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES);
    Automaton actualDet = Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES);
    if (Operations.sameLanguage(expectedDet, actualDet) == false) {
      Set<String> expectedPaths = toPathStrings(expectedDet);
      Set<String> actualPaths = toPathStrings(actualDet);
      StringBuilder b = new StringBuilder();
      b.append("expected:\n");
      for(String path : expectedPaths) {
        b.append("  ");
        b.append(path);
        if (actualPaths.contains(path) == false) {
          b.append(" [missing!]");
        }
        b.append('\n');
      }
      b.append("actual:\n");
      for(String path : actualPaths) {
        b.append("  ");
        b.append(path);
        if (expectedPaths.contains(path) == false) {
          b.append(" [unexpected!]");
        }
        b.append('\n');
      }
      fail("accepted language is different:\n\n" + b.toString());
    }
  }

  public void testTokenStreamGraphWithHoles() throws Exception {
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 1, 8),
        token("def", 1, 1),
        token("ghi", 1, 1),
      });
    assertSameLanguage(Operations.union(join(s2a("abc"), SEP_A, s2a("xyz")),
                                        join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"))), ts);
  }
}