package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

/**
 * Base class for all Lucene unit tests that use TokenStreams.
 * <p>
 * When writing unit tests for analysis components, it's highly recommended
 * to use the helper methods here (especially in conjunction with {@link MockAnalyzer} or
 * {@link MockTokenizer}), as they contain many assertions and checks to
 * catch bugs.
 *
 * @see MockAnalyzer
 * @see MockTokenizer
 */
public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
  // some helpers to test Analyzers and TokenStreams:

  public static interface CheckClearAttributesAttribute extends Attribute {
    boolean getAndResetClearCalled();
  }

  public static final class CheckClearAttributesAttributeImpl extends AttributeImpl implements CheckClearAttributesAttribute {
    private boolean clearCalled = false;

    public boolean getAndResetClearCalled() {
      try {
        return clearCalled;
      } finally {
        clearCalled = false;
      }
    }

    @Override
    public void clear() {
      clearCalled = true;
    }

    @Override
    public boolean equals(Object other) {
      return (
        other instanceof CheckClearAttributesAttributeImpl &&
        ((CheckClearAttributesAttributeImpl) other).clearCalled == this.clearCalled
      );
    }

    @Override
    public int hashCode() {
      return 76137213 ^ Boolean.valueOf(clearCalled).hashCode();
    }

    @Override
    public void copyTo(AttributeImpl target) {
      ((CheckClearAttributesAttributeImpl) target).clear();
    }
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
      assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
      offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
      assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
      typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null) {
      assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
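      // the attribute is only fetched (getAttribute), not added, so the assertion above is what
      // guarantees the stream's own chain actually produces it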
      posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    ts.reset();
    for (int i = 0; i < output.length; i++) {
      // extra safety to enforce that the state is not preserved, and also assign bogus values
      ts.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
      if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
      if (typeAtt != null) typeAtt.setType("bogusType");
      if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);

      checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before
      assertTrue("token "+i+" does not exist", ts.incrementToken());
      assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());

      assertEquals("term "+i, output[i], termAtt.toString());
      if (startOffsets != null)
        assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset());
      if (endOffsets != null)
        assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset());
      if (types != null)
        assertEquals("type "+i, types[i], typeAtt.type());
      if (posIncrements != null)
        assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());

      // we can enforce some basic things about a few attributes even if the caller doesn't check:
      if (offsetAtt != null) {
        assertTrue("startOffset must be >= 0", offsetAtt.startOffset() >= 0);
        assertTrue("endOffset must be >= 0", offsetAtt.endOffset() >= 0);
        assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
      }
      if (posIncrAtt != null) {
        assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
      }
    }
    assertFalse("end of stream", ts.incrementToken());
    ts.end();
    if (finalOffset != null)
      assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
    if (offsetAtt != null) {
      assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    ts.close();
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
    assertTokenStreamContents(ts, output, null, null, types, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int
      endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, null);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, types, null);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements);
  }

  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
  }

  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
    assertAnalyzesToReuse(a, input, output, null, null, null, null);
  }

  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, String[] types) throws IOException {
    assertAnalyzesToReuse(a, input, output, null, null, types, null);
  }

  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
    assertAnalyzesToReuse(a, input, output, null, null, null, posIncrements);
  }

  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, null);
  }

  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, posIncrements);
  }

  // simple utility method for testing stemmers
  public static void checkOneTerm(Analyzer a, final String input, final String expected) throws IOException {
    assertAnalyzesTo(a, input, new String[]{expected});
  }

  public static void checkOneTermReuse(Analyzer a, final String input, final String expected) throws IOException {
    assertAnalyzesToReuse(a, input, new String[]{expected});
  }

  // simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy
  public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
    checkRandomData(random, a, iterations, 20);
  }

  public static void checkRandomData(Random random,
      Analyzer a, int iterations, int maxWordLength) throws IOException {
    for (int i = 0; i < iterations; i++) {
      String text;
      switch(_TestUtil.nextInt(random, 0, 3)) {
        case 0:
          text = _TestUtil.randomSimpleString(random);
          break;
        case 1:
          text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
          break;
        default:
          text = _TestUtil.randomUnicodeString(random, maxWordLength);
      }

      if (VERBOSE) {
        System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
      }

      TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text));
      assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
      PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
      TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
      List<String> tokens = new ArrayList<String>();
      List<String> types = new ArrayList<String>();
      List<Integer> positions = new ArrayList<Integer>();
      List<Integer> startOffsets = new ArrayList<Integer>();
      List<Integer> endOffsets = new ArrayList<Integer>();
      ts.reset();
      while (ts.incrementToken()) {
        tokens.add(termAtt.toString());
        if (typeAtt != null) types.add(typeAtt.type());
        if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
        if (offsetAtt != null) {
          startOffsets.add(offsetAtt.startOffset());
          endOffsets.add(offsetAtt.endOffset());
        }
      }
      ts.end();
      ts.close();
      // verify reusing is "reproducible" and also get the normal tokenstream sanity checks
      if (!tokens.isEmpty()) {
        if (VERBOSE) {
          System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
        }
        if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
          // offset + pos + type
          assertAnalyzesToReuse(a, text,
            tokens.toArray(new String[tokens.size()]),
            toIntArray(startOffsets),
            toIntArray(endOffsets),
            types.toArray(new String[types.size()]),
            toIntArray(positions));
        } else if (posIncAtt != null && offsetAtt != null) {
          // offset + pos
          assertAnalyzesToReuse(a, text,
            tokens.toArray(new String[tokens.size()]),
            toIntArray(startOffsets),
            toIntArray(endOffsets),
            toIntArray(positions));
        } else if (offsetAtt != null) {
          // offset
          assertAnalyzesToReuse(a, text,
            tokens.toArray(new String[tokens.size()]),
            toIntArray(startOffsets),
            toIntArray(endOffsets));
        } else {
          // terms only
          assertAnalyzesToReuse(a, text,
            tokens.toArray(new String[tokens.size()]));
        }
      }
    }
  }

  static int[] toIntArray(List<Integer> list) {
    int ret[] = new int[list.size()];
    int offset = 0;
    for (Integer i : list) {
      ret[offset++] = i;
    }
    return ret;
  }
}
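
// Illustrative usage (not part of the original class): a typical subclass of
// BaseTokenStreamTestCase might combine the fixed-output and random-data helpers as in the
// sketch below. The MockAnalyzer constructor arguments and the source of the "random"
// instance are assumptions that depend on the Lucene version in use.
//
//   public class TestMyAnalysisChain extends BaseTokenStreamTestCase {
//     public void testBasics() throws Exception {
//       Analyzer a = new MockAnalyzer(random);  // hypothetical: whitespace + lowercase mock
//       // expected terms plus start/end offsets for the input "foo bar"
//       assertAnalyzesTo(a, "foo bar", new String[] {"foo", "bar"},
//                        new int[] {0, 4}, new int[] {3, 7});
//       // blast random text through the chain to catch offset/posIncrement bugs
//       checkRandomData(random, a, 10000);
//     }
//   }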