BaseTokenStreamTestCase.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.CountDownLatch;

import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.fst.Util;

/** 
 * Base class for all Lucene unit tests that use TokenStreams. 
 * <p>
 * When writing unit tests for analysis components, it's highly recommended
 * to use the helper methods here (especially in conjunction with {@link MockAnalyzer} or
 * {@link MockTokenizer}), as they contain many assertions and checks to 
 * catch bugs.
 * 
 * @see MockAnalyzer
 * @see MockTokenizer
 */
public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
  // some helpers to test Analyzers and TokenStreams:
  
  /**
   * Attribute that records if it was cleared or not.  This is used 
   * for testing that clearAttributes() was called correctly.
   */
  public static interface CheckClearAttributesAttribute extends Attribute {
    boolean getAndResetClearCalled();
  }

  /**
   * Attribute that records if it was cleared or not.  This is used 
   * for testing that clearAttributes() was called correctly.
   */
  public static final class CheckClearAttributesAttributeImpl extends AttributeImpl implements CheckClearAttributesAttribute {
    private boolean clearCalled = false;
    
    @Override
    public boolean getAndResetClearCalled() {
      try {
        return clearCalled;
      } finally {
        clearCalled = false;
      }
    }

    @Override
    public void clear() {
      clearCalled = true;
    }

    @Override
    public boolean equals(Object other) {
      return (
        other instanceof CheckClearAttributesAttributeImpl &&
        ((CheckClearAttributesAttributeImpl) other).clearCalled == this.clearCalled
      );
    }

    @Override
    public int hashCode() {
      return 76137213 ^ Boolean.valueOf(clearCalled).hashCode();
    }
    
    @Override
    public void copyTo(AttributeImpl target) {
      ((CheckClearAttributesAttributeImpl) target).clear();
    }

    @Override
    public void reflectWith(AttributeReflector reflector) {
      reflector.reflect(CheckClearAttributesAttribute.class, "clearCalled", clearCalled);
    }
  }

  // offsetsAreCorrect also validates:
  //   - graph offsets are correct (all tokens leaving from
  //     pos X have the same startOffset; all tokens
  //     arriving to pos Y have the same endOffset)
  //   - offsets only move forwards (startOffset >=
  //     lastStartOffset)
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
                                               int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
                                               boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
    
    CharTermAttribute termAtt = null;
    if (output.length > 0) {
      assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
      termAtt = ts.getAttribute(CharTermAttribute.class);
    }
    
    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
      assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
      offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }
    
    TypeAttribute typeAtt = null;
    if (types != null) {
      assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
      typeAtt = ts.getAttribute(TypeAttribute.class);
    }
    
    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
      assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
      posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
      assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
      posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
      assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
      keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }
    
    // Maps position to the start/end offset:
    final Map<Integer,Integer> posToStartOffset = new HashMap<>();
    final Map<Integer,Integer> posToEndOffset = new HashMap<>();

    // TODO: would be nice to be able to assert silly duplicated tokens are not created, but a number of cases do this "legitimately": LUCENE-7622

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
      // extra safety to enforce, that the state is not preserved and also assign bogus values
      ts.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
      if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
      if (typeAtt != null) typeAtt.setType("bogusType");
      if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
      if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
      if (keywordAtt != null) keywordAtt.setKeyword((i&1) == 0);
      
      checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
      assertTrue("token "+i+" does not exist", ts.incrementToken());
      assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());

      assertEquals("term "+i, output[i], termAtt.toString());
      if (startOffsets != null) {
        assertEquals("startOffset " + i + " term=" + termAtt, startOffsets[i], offsetAtt.startOffset());
      }
      if (endOffsets != null) {
        assertEquals("endOffset " + i + " term=" + termAtt, endOffsets[i], offsetAtt.endOffset());
      }
      if (types != null) {
        assertEquals("type " + i + " term=" + termAtt, types[i], typeAtt.type());
      }
      if (posIncrements != null) {
        assertEquals("posIncrement " + i + " term=" + termAtt, posIncrements[i], posIncrAtt.getPositionIncrement());
      }
      if (posLengths != null) {
        assertEquals("posLength " + i + " term=" + termAtt, posLengths[i], posLengthAtt.getPositionLength());
      }
      if (keywordAtts != null) {
        assertEquals("keywordAtt " + i + " term=" + termAtt, keywordAtts[i], keywordAtt.isKeyword());
      }
      
      // we can enforce some basic things about a few attributes even if the caller doesn't check:
      if (offsetAtt != null) {
        final int startOffset = offsetAtt.startOffset();
        final int endOffset = offsetAtt.endOffset();
        if (finalOffset != null) {
          assertTrue("startOffset (= " + startOffset + ") must be <= finalOffset (= " + finalOffset + ") term=" + termAtt, startOffset <= finalOffset.intValue());
          assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue() + " term=" + termAtt,
                     endOffset <= finalOffset.intValue());
        }

        if (offsetsAreCorrect) {
          assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
          lastStartOffset = offsetAtt.startOffset();
        }

        if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
          // Validate offset consistency in the graph, ie
          // all tokens leaving from a certain pos have the
          // same startOffset, and all tokens arriving to a
          // certain pos have the same endOffset:
          final int posInc = posIncrAtt.getPositionIncrement();
          pos += posInc;

          final int posLength = posLengthAtt.getPositionLength();

          if (!posToStartOffset.containsKey(pos)) {
            // First time we've seen a token leaving from this position:
            posToStartOffset.put(pos, startOffset);
            //System.out.println("  + s " + pos + " -> " + startOffset);
          } else {
            // We've seen a token leaving from this position
            // before; verify the startOffset is the same:
            //System.out.println("  + vs " + pos + " -> " + startOffset);
            assertEquals(i + " inconsistent startOffset: pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset);
          }

          final int endPos = pos + posLength;

          if (!posToEndOffset.containsKey(endPos)) {
            // First time we've seen a token arriving to this position:
            posToEndOffset.put(endPos, endOffset);
            //System.out.println("  + e " + endPos + " -> " + endOffset);
          } else {
            // We've seen a token arriving to this position
            // before; verify the endOffset is the same:
            //System.out.println("  + ve " + endPos + " -> " + endOffset);
            assertEquals("inconsistent endOffset " + i + " pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset);
          }
        }
      }
      if (posIncrAtt != null) {
        if (i == 0) {
          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
        } else {
          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
        }
      }
      if (posLengthAtt != null) {
        assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
      }
    }

    if (ts.incrementToken()) {
      fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + ts.getAttribute(CharTermAttribute.class));
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null) termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
    if (typeAtt != null) typeAtt.setType("bogusType");
    if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
    
    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before

    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()", checkClearAtt.getAndResetClearCalled());
    
    if (finalOffset != null) {
      assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
      assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
      assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }

    ts.close();
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
                                               int posLengths[], Integer finalOffset, boolean[] keywordAtts,
                                               boolean offsetsAreCorrect) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, null, offsetsAreCorrect);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, null, null, null);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
    assertTokenStreamContents(ts, output, null, null, types, null, null, null);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    checkResetException(a, input);
    checkAnalysisConsistency(random(), a, true, input);
    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
    checkResetException(a, input);
    checkAnalysisConsistency(random(), a, true, input);
    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
    checkResetException(a, input);
    checkAnalysisConsistency(random(), a, true, input, offsetsAreCorrect);
    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, null, null);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, types, null, null);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
  }

  public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
  }

  public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, String[] types, int[] posIncrements, int[] posLengths) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, types, posIncrements, posLengths);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
  }

  static void checkResetException(Analyzer a, String input) throws IOException {
    TokenStream ts = a.tokenStream("bogus", input);
    try {
      if (ts.incrementToken()) {
        //System.out.println(ts.reflectAsString(false));
        fail("didn't get expected exception when reset() not called");
      }
    } catch (IllegalStateException expected) {
      // ok
    } catch (Exception unexpected) {
      unexpected.printStackTrace(System.err);
      fail("got wrong exception when reset() not called: " + unexpected);
    } finally {
      // consume correctly
      ts.reset();
      while (ts.incrementToken()) {}
      ts.end();
      ts.close();
    }
    
    // check for a missing close()
    ts = a.tokenStream("bogus", input);
    ts.reset();
    while (ts.incrementToken()) {}
    ts.end();
    try {
      ts = a.tokenStream("bogus", input);
      fail("didn't get expected exception when close() not called");
    } catch (IllegalStateException expected) {
      // ok
    } finally {
      ts.close();
    }
  }

  // simple utility method for testing stemmers
  
  public static void checkOneTerm(Analyzer a, final String input, final String expected) throws IOException {
    assertAnalyzesTo(a, input, new String[]{expected});
  }
  
  /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
  public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
    checkRandomData(random, a, iterations, 20, false, true);
  }

  /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
    checkRandomData(random, a, iterations, maxWordLength, false, true);
  }
  
  /** 
   * utility method for blasting tokenstreams with data to make sure they don't do anything crazy 
   * @param simple true if only ascii strings will be used (try to avoid)
   */
  public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
    checkRandomData(random, a, iterations, 20, simple, true);
  }
  
  /** Asserts that the given stream has expected number of tokens. */
  public static void assertStreamHasNumberOfTokens(TokenStream ts, int expectedCount) throws IOException {
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
      count++;
    }
    ts.end();
    assertEquals("wrong number of tokens", expectedCount, count);
  }
  
  static class AnalysisThread extends Thread {
    final int iterations;
    final int maxWordLength;
    final long seed;
    final Analyzer a;
    final boolean useCharFilter;
    final boolean simple;
    final boolean offsetsAreCorrect;
    final RandomIndexWriter iw;
    final CountDownLatch latch;

    // NOTE: not volatile because we don't want the tests to
    // add memory barriers (ie alter how threads
    // interact)... so this is just "best effort":
    public boolean failed;
    
    AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) {
      this.seed = seed;
      this.a = a;
      this.iterations = iterations;
      this.maxWordLength = maxWordLength;
      this.useCharFilter = useCharFilter;
      this.simple = simple;
      this.offsetsAreCorrect = offsetsAreCorrect;
      this.iw = iw;
      this.latch = latch;
    }
    
    @Override
    public void run() {
      boolean success = false;
      try {
        latch.await();
        // see the part in checkRandomData where it replays the same text again
        // to verify reproducability/reuse: hopefully this would catch thread hazards.
        checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
        success = true;
      } catch (Exception e) {
        Rethrow.rethrow(e);
      } finally {
        failed = !success;
      }
    }
  };
  
  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
    checkRandomData(random, a, iterations, maxWordLength, simple, true);
  }

  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
    checkResetException(a, "best effort");
    long seed = random.nextLong();
    boolean useCharFilter = random.nextBoolean();
    Directory dir = null;
    RandomIndexWriter iw = null;
    final String postingsFormat =  TestUtil.getPostingsFormat("dummy");
    boolean codecOk = iterations * maxWordLength < 100000 ||
        !(postingsFormat.equals("Memory") ||
            postingsFormat.equals("SimpleText"));
    if (rarely(random) && codecOk) {
      dir = newFSDirectory(createTempDir("bttc"));
      iw = new RandomIndexWriter(new Random(seed), dir, a);
    }
    boolean success = false;
    try {
      checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
      // now test with multiple threads: note we do the EXACT same thing we did before in each thread,
      // so this should only really fail from another thread if it's an actual thread problem
      int numThreads = TestUtil.nextInt(random, 2, 4);
      final CountDownLatch startingGun = new CountDownLatch(1);
      AnalysisThread threads[] = new AnalysisThread[numThreads];
      for (int i = 0; i < threads.length; i++) {
        threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
      }
      for (int i = 0; i < threads.length; i++) {
        threads[i].start();
      }
      startingGun.countDown();
      for (int i = 0; i < threads.length; i++) {
        try {
          threads[i].join();
        } catch (InterruptedException e) {
          throw new RuntimeException(e);
        }
      }
      for (int i = 0; i < threads.length; i++) {
        if (threads[i].failed) {
          throw new RuntimeException("some thread(s) failed");
        }
      }
      if (iw != null) {
        iw.close();
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(dir);
      } else {
        IOUtils.closeWhileHandlingException(dir); // checkindex
      }
    }
  }

  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {

    final LineFileDocs docs = new LineFileDocs(random);
    Document doc = null;
    Field field = null, currentField = null;
    StringReader bogus = new StringReader("");
    if (iw != null) {
      doc = new Document();
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      if (random.nextBoolean()) {
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(random.nextBoolean());
        ft.setStoreTermVectorPositions(random.nextBoolean());
        if (ft.storeTermVectorPositions()) {
          ft.setStoreTermVectorPayloads(random.nextBoolean());
        }
      }
      if (random.nextBoolean()) {
        ft.setOmitNorms(true);
      }
      switch(random.nextInt(4)) {
        case 0: ft.setIndexOptions(IndexOptions.DOCS); break;
        case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
        case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
        default:
          if (offsetsAreCorrect) {
            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
          } else {
            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
          }
      }
      currentField = field = new Field("dummy", bogus, ft);
      doc.add(currentField);
    }
    
    try {
      for (int i = 0; i < iterations; i++) {
        String text;

        if (random.nextInt(10) == 7) {
          // real data from linedocs
          text = docs.nextDoc().get("body");
          if (text.length() > maxWordLength) {
            
            // Take a random slice from the text...:
            int startPos = random.nextInt(text.length() - maxWordLength);
            if (startPos > 0 && Character.isLowSurrogate(text.charAt(startPos))) {
              // Take care not to split up a surrogate pair:
              startPos--;
              assert Character.isHighSurrogate(text.charAt(startPos));
            }
            int endPos = startPos + maxWordLength - 1;
            if (Character.isHighSurrogate(text.charAt(endPos))) {
              // Take care not to split up a surrogate pair:
              endPos--;
            }
            text = text.substring(startPos, 1+endPos);
          }
        } else {
          // synthetic
          text = TestUtil.randomAnalysisString(random, maxWordLength, simple);
        }

        try {
          checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
          if (iw != null) {
            if (random.nextInt(7) == 0) {
              // pile up a multivalued field
              IndexableFieldType ft = field.fieldType();
              currentField = new Field("dummy", bogus, ft);
              doc.add(currentField);
            } else {
              iw.addDocument(doc);
              if (doc.getFields().size() > 1) {
                // back to 1 field
                currentField = field;
                doc.removeFields("dummy");
                doc.add(currentField);
              }
            }
          }
        } catch (Throwable t) {
          // TODO: really we should pass a random seed to
          // checkAnalysisConsistency then print it here too:
          System.err.println("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + escape(text) + "'");
          Rethrow.rethrow(t);
        }
      }
    } finally {
      IOUtils.closeWhileHandlingException(docs);
    }
  }

  public static String escape(String s) {
    int charUpto = 0;
    final StringBuilder sb = new StringBuilder();
    while (charUpto < s.length()) {
      final int c = s.charAt(charUpto);
      if (c == 0xa) {
        // Strangely, you cannot put \ u000A into Java
        // sources (not in a comment nor a string
        // constant)...:
        sb.append("\\n");
      } else if (c == 0xd) {
        // ... nor \ u000D:
        sb.append("\\r");
      } else if (c == '"') {
        sb.append("\\\"");
      } else if (c == '\\') {
        sb.append("\\\\");
      } else if (c >= 0x20 && c < 0x80) {
        sb.append((char) c);
      } else {
        // TODO: we can make ascii easier to read if we
        // don't escape...
        sb.append(String.format(Locale.ROOT, "\\u%04x", c));
      }
      charUpto++;
    }
    return sb.toString();
  }

  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
    checkAnalysisConsistency(random, a, useCharFilter, text, true);
  }

  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
    checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, null);
  }
  
  private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect, Field field) throws IOException {

    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
    }

    int remainder = random.nextInt(10);
    Reader reader = new StringReader(text);
    TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
    List<String> tokens = new ArrayList<>();
    List<String> types = new ArrayList<>();
    List<Integer> positions = new ArrayList<>();
    List<Integer> positionLengths = new ArrayList<>();
    List<Integer> startOffsets = new ArrayList<>();
    List<Integer> endOffsets = new ArrayList<>();
    ts.reset();

    // First pass: save away "correct" tokens
    while (ts.incrementToken()) {
      assertNotNull("has no CharTermAttribute", termAtt);
      tokens.add(termAtt.toString());
      if (typeAtt != null) types.add(typeAtt.type());
      if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
      if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
      if (offsetAtt != null) {
        startOffsets.add(offsetAtt.startOffset());
        endOffsets.add(offsetAtt.endOffset());
      }
    }
    ts.end();
    ts.close();

    // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
    if (!tokens.isEmpty()) {

      // KWTokenizer (for example) can produce a token
      // even when input is length 0:
      if (text.length() != 0) {

        // (Optional) second pass: do something evil:
        final int evilness = random.nextInt(50);
        if (evilness == 17) {
          if (VERBOSE) {
            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
          }
          // Throw an errant exception from the Reader:

          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
          evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
          reader = evilReader;

          try {
            // NOTE: some Tokenizers go and read characters
            // when you call .setReader(Reader), eg
            // PatternTokenizer.  This is a bit
            // iffy... (really, they should only
            // pull from the Reader when you call
            // .incremenToken(), I think?), but we
            // currently allow it, so, we must call
            // a.tokenStream inside the try since we may
            // hit the exc on init:
            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
            ts.reset();
            while (ts.incrementToken());
            fail("did not hit exception");
          } catch (RuntimeException re) {
            assertTrue(MockReaderWrapper.isMyEvilException(re));
          }
          try {
            ts.end();
          } catch (IllegalStateException ise) {
            // Catch & ignore MockTokenizer's
            // anger...
            if (ise.getMessage().contains("end() called in wrong state=")) {
              // OK
            } else {
              throw ise;
            }
          }
          ts.close();
        } else if (evilness == 7) {
          // Only consume a subset of the tokens:
          final int numTokensToRead = random.nextInt(tokens.size());
          if (VERBOSE) {
            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
          }

          reader = new StringReader(text);
          ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
          ts.reset();
          for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
            assertTrue(ts.incrementToken());
          }
          try {
            ts.end();
          } catch (IllegalStateException ise) {
            // Catch & ignore MockTokenizer's
            // anger...
            if (ise.getMessage().contains("end() called in wrong state=")) {
              // OK
            } else {
              throw ise;
            }
          }
          ts.close();
        }
      }
    }

    // Final pass: verify clean tokenization matches
    // results from first pass:

    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
    }
    reader = new StringReader(text);

    long seed = random.nextLong();
    random = new Random(seed);
    if (random.nextInt(30) == 7) {
      if (VERBOSE) {
        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
      }

      reader = new MockReaderWrapper(random, reader);
    }

    ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
    if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
      // offset + pos + posLength + type
      assertTokenStreamContents(ts, 
                                tokens.toArray(new String[tokens.size()]),
                                toIntArray(startOffsets),
                                toIntArray(endOffsets),
                                types.toArray(new String[types.size()]),
                                toIntArray(positions),
                                toIntArray(positionLengths),
                                text.length(),
                                offsetsAreCorrect);
    } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
      // offset + pos + type
      assertTokenStreamContents(ts, 
                                tokens.toArray(new String[tokens.size()]),
                                toIntArray(startOffsets),
                                toIntArray(endOffsets),
                                types.toArray(new String[types.size()]),
                                toIntArray(positions),
                                null,
                                text.length(),
                                offsetsAreCorrect);
    } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
      // offset + pos + posLength
      assertTokenStreamContents(ts, 
                                tokens.toArray(new String[tokens.size()]),
                                toIntArray(startOffsets),
                                toIntArray(endOffsets),
                                null,
                                toIntArray(positions),
                                toIntArray(positionLengths),
                                text.length(),
                                offsetsAreCorrect);
    } else if (posIncAtt != null && offsetAtt != null) {
      // offset + pos
      assertTokenStreamContents(ts, 
                                tokens.toArray(new String[tokens.size()]),
                                toIntArray(startOffsets),
                                toIntArray(endOffsets),
                                null,
                                toIntArray(positions),
                                null,
                                text.length(),
                                offsetsAreCorrect);
    } else if (offsetAtt != null) {
      // offset
      assertTokenStreamContents(ts, 
                                tokens.toArray(new String[tokens.size()]),
                                toIntArray(startOffsets),
                                toIntArray(endOffsets),
                                null,
                                null,
                                null,
                                text.length(),
                                offsetsAreCorrect);
    } else {
      // terms only
      assertTokenStreamContents(ts, 
                                tokens.toArray(new String[tokens.size()]));
    }

    a.normalize("dummy", text);
    // TODO: what can we do besides testing that the above method does not throw?

    if (field != null) {
      reader = new StringReader(text);
      random = new Random(seed);
      if (random.nextInt(30) == 7) {
        if (VERBOSE) {
          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: indexing using spoon-feed reader");
        }

        reader = new MockReaderWrapper(random, reader);
      }

      field.setReaderValue(useCharFilter ? new MockCharFilter(reader, remainder) : reader);
    }
  }

  protected String toDot(Analyzer a, String inputText) throws IOException {
    final StringWriter sw = new StringWriter();
    final TokenStream ts = a.tokenStream("field", inputText);
    ts.reset();
    new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot();
    return sw.toString();
  }

  protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException {
    Writer w = Files.newBufferedWriter(Paths.get(localFileName), StandardCharsets.UTF_8);
    final TokenStream ts = a.tokenStream("field", inputText);
    ts.reset();
    new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot();
    w.close();
  }
  
  static int[] toIntArray(List<Integer> list) {
    int ret[] = new int[list.size()];
    int offset = 0;
    for (Integer i : list) {
      ret[offset++] = i;
    }
    return ret;
  }

  protected static MockTokenizer whitespaceMockTokenizer(Reader input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    mockTokenizer.setReader(input);
    return mockTokenizer;
  }

  protected static MockTokenizer whitespaceMockTokenizer(String input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    mockTokenizer.setReader(new StringReader(input));
    return mockTokenizer;
  }

  protected static MockTokenizer keywordMockTokenizer(Reader input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
    mockTokenizer.setReader(input);
    return mockTokenizer;
  }

  protected static MockTokenizer keywordMockTokenizer(String input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
    mockTokenizer.setReader(new StringReader(input));
    return mockTokenizer;
  }
  
  /** Returns a random AttributeFactory impl */
  public static AttributeFactory newAttributeFactory(Random random) {
    switch (random.nextInt(3)) {
      case 0:
        return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
      case 1:
        return Token.TOKEN_ATTRIBUTE_FACTORY;
      case 2:
        return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
      default:
        throw new AssertionError("Please fix the Random.nextInt() call above");
    }
  }
  
  /** Returns a random AttributeFactory impl */
  public static AttributeFactory newAttributeFactory() {
    return newAttributeFactory(random());
  }

  private static String toString(Set<String> strings) {
    List<String> stringsList = new ArrayList<>(strings);
    Collections.sort(stringsList);
    StringBuilder b = new StringBuilder();
    for(String s : stringsList) {
      b.append("  ");
      b.append(s);
      b.append('\n');
    }
    return b.toString();
  }

  /**
   * Enumerates all accepted strings in the token graph created by the analyzer on the provided text, and then
   * asserts that it's equal to the expected strings.
   * Uses {@link TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton are all
   * and only the given valid strings.
   * @param analyzer analyzer containing the SynonymFilter under test.
   * @param text text to be analyzed.
   * @param expectedStrings all expected finite strings.
   */
  public static void assertGraphStrings(Analyzer analyzer, String text, String... expectedStrings) throws IOException {
    checkAnalysisConsistency(random(), analyzer, true, text, true);
    try (TokenStream tokenStream = analyzer.tokenStream("dummy", text)) {
      assertGraphStrings(tokenStream, expectedStrings);
    }
  }

  /**
   * Enumerates all accepted strings in the token graph created by the already initialized {@link TokenStream}.
   */
  public static void assertGraphStrings(TokenStream tokenStream, String... expectedStrings) throws IOException {
    Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
    Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);

    Set<String> expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings));

    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
    Set<String> actualStrings = new HashSet<>();
    for (IntsRef ir: actualStringPaths) {
      actualStrings.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    }
    for (String s : actualStrings) {
      assertTrue("Analyzer created unexpected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), expectedStringsSet.contains(s));
    }
    for (String s : expectedStrings) {
      assertTrue("Analyzer created unexpected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), actualStrings.contains(s));
    }
  }

  /** Returns all paths accepted by the token stream graph produced by analyzing text with the provided analyzer.  The tokens {@link
   *  CharTermAttribute} values are concatenated, and separated with space. */
  public static Set<String> getGraphStrings(Analyzer analyzer, String text) throws IOException {
    try(TokenStream tokenStream = analyzer.tokenStream("dummy", text)) {
      return getGraphStrings(tokenStream);
    }
  }

  /** Returns all paths accepted by the token stream graph produced by the already initialized {@link TokenStream}. */
  public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
    Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
    Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
    Set<String> paths = new HashSet<>();
    for (IntsRef ir: actualStringPaths) {
      paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    }
    return paths;
  }

  /** Returns a {@code String} summary of the tokens this analyzer produces on this text */
  public static String toString(Analyzer analyzer, String text) throws IOException {
    try(TokenStream ts = analyzer.tokenStream("field", text)) {
      StringBuilder b = new StringBuilder();
      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
      PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
      assertNotNull(offsetAtt);
      ts.reset();
      int pos = -1;
      while (ts.incrementToken()) {
        pos += posIncAtt.getPositionIncrement();
        b.append(termAtt);
        b.append(" at pos=");
        b.append(pos);
        if (posLengthAtt != null) {
          b.append(" to pos=");
          b.append(pos + posLengthAtt.getPositionLength());
        }
        b.append(" offsets=");
        b.append(offsetAtt.startOffset());
        b.append('-');
        b.append(offsetAtt.endOffset());
        b.append('\n');
      }
      ts.end();
      return b.toString();
    }
  }
}