/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;

import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Random;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

public class TestMockAnalyzer extends BaseTokenStreamTestCase {

  /** Test a configuration that behaves a lot like WhitespaceAnalyzer */
  public void testWhitespace() throws Exception {
    Analyzer a = new MockAnalyzer(random());
    assertAnalyzesTo(a, "A bc defg hiJklmn opqrstuv wxy z ",
        new String[] { "a", "bc", "defg", "hijklmn", "opqrstuv", "wxy", "z" });
    assertAnalyzesTo(a, "aba cadaba shazam",
        new String[] { "aba", "cadaba", "shazam" });
    assertAnalyzesTo(a, "break on whitespace",
        new String[] { "break", "on", "whitespace" });
  }

  /** Test a configuration that behaves a lot like SimpleAnalyzer */
  public void testSimple() throws Exception {
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
    assertAnalyzesTo(a, "a-bc123 defg+hijklmn567opqrstuv78wxy_z ",
        new String[] { "a", "bc", "defg", "hijklmn", "opqrstuv", "wxy", "z" });
    assertAnalyzesTo(a, "aba4cadaba-Shazam",
        new String[] { "aba", "cadaba", "shazam" });
    assertAnalyzesTo(a, "break+on/Letters",
        new String[] { "break", "on", "letters" });
  }

  /** Test a configuration that behaves a lot like KeywordAnalyzer */
  public void testKeyword() throws Exception {
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    assertAnalyzesTo(a, "a-bc123 defg+hijklmn567opqrstuv78wxy_z ",
        new String[] { "a-bc123 defg+hijklmn567opqrstuv78wxy_z " });
    assertAnalyzesTo(a, "aba4cadaba-Shazam",
        new String[] { "aba4cadaba-Shazam" });
    assertAnalyzesTo(a, "break+on/Nothing",
        new String[] { "break+on/Nothing" });
    // Currently, though, it emits no tokens for the empty string: maybe we
    // could, but we don't want to emit tokens infinitely...
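    // (in KEYWORD mode the tokenizer returns the entire input as a single
    // token, so an empty input yields no token at all rather than a
    // zero-length one)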
assertAnalyzesTo(a, "", new String[0]); } // Test some regular expressions as tokenization patterns /** Test a configuration where each character is a term */ public void testSingleChar() throws Exception { CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp(".").toAutomaton()); Analyzer a = new MockAnalyzer(random(), single, false); assertAnalyzesTo(a, "foobar", new String[] { "f", "o", "o", "b", "a", "r" }, new int[] { 0, 1, 2, 3, 4, 5 }, new int[] { 1, 2, 3, 4, 5, 6 } ); checkRandomData(random(), a, 100); } /** Test a configuration where two characters makes a term */ public void testTwoChars() throws Exception { CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp("..").toAutomaton()); Analyzer a = new MockAnalyzer(random(), single, false); assertAnalyzesTo(a, "foobar", new String[] { "fo", "ob", "ar"}, new int[] { 0, 2, 4 }, new int[] { 2, 4, 6 } ); // make sure when last term is a "partial" match that end() is correct assertTokenStreamContents(a.tokenStream("bogus", "fooba"), new String[] { "fo", "ob" }, new int[] { 0, 2 }, new int[] { 2, 4 }, new int[] { 1, 1 }, new Integer(5) ); checkRandomData(random(), a, 100); } /** Test a configuration where three characters makes a term */ public void testThreeChars() throws Exception { CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp("...").toAutomaton()); Analyzer a = new MockAnalyzer(random(), single, false); assertAnalyzesTo(a, "foobar", new String[] { "foo", "bar"}, new int[] { 0, 3 }, new int[] { 3, 6 } ); // make sure when last term is a "partial" match that end() is correct assertTokenStreamContents(a.tokenStream("bogus", "fooba"), new String[] { "foo" }, new int[] { 0 }, new int[] { 3 }, new int[] { 1 }, new Integer(5) ); checkRandomData(random(), a, 100); } /** Test a configuration where word starts with one uppercase */ public void testUppercase() throws Exception { CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton()); Analyzer a = new MockAnalyzer(random(), single, false); assertAnalyzesTo(a, "FooBarBAZ", new String[] { "Foo", "Bar", "B", "A", "Z"}, new int[] { 0, 3, 6, 7, 8 }, new int[] { 3, 6, 7, 8, 9 } ); assertAnalyzesTo(a, "aFooBar", new String[] { "Foo", "Bar" }, new int[] { 1, 4 }, new int[] { 4, 7 } ); checkRandomData(random(), a, 100); } /** Test a configuration that behaves a lot like StopAnalyzer */ public void testStop() throws Exception { Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); assertAnalyzesTo(a, "the quick brown a fox", new String[] { "quick", "brown", "fox" }, new int[] { 2, 1, 2 }); } /** Test a configuration that behaves a lot like KeepWordFilter */ public void testKeep() throws Exception { CharacterRunAutomaton keepWords = new CharacterRunAutomaton( Operations.complement( Operations.union( Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))), DEFAULT_MAX_DETERMINIZED_STATES)); Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords); assertAnalyzesTo(a, "quick foo brown bar bar fox foo", new String[] { "foo", "bar", "bar", "foo" }, new int[] { 2, 2, 1, 2 }); } /** Test a configuration that behaves a lot like LengthFilter */ public void testLength() throws Exception { CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton()); Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5); assertAnalyzesTo(a, "ok toolong fine notfine", new String[] { "ok", "fine" }, new 
        new int[] { 1, 2 });
  }

  /** Test MockTokenizer encountering a too-long token */
  public void testTooLongToken() throws Exception {
    Analyzer whitespace = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false, 5);
        return new TokenStreamComponents(t, t);
      }
    };

    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolong ok "),
        new String[] { "test", "123", "toolo", "ng", "ok" },
        new int[] { 0, 5, 9, 14, 17 },
        new int[] { 4, 8, 14, 16, 19 },
        Integer.valueOf(20));

    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolo"),
        new String[] { "test", "123", "toolo" },
        new int[] { 0, 5, 9 },
        new int[] { 4, 8, 14 },
        Integer.valueOf(14));
  }

  public void testLUCENE_3042() throws Exception {
    String testString = "t";

    Analyzer analyzer = new MockAnalyzer(random());
    try (TokenStream stream = analyzer.tokenStream("dummy", testString)) {
      stream.reset();
      while (stream.incrementToken()) {
        // consume
      }
      stream.end();
    }

    assertAnalyzesTo(analyzer, testString, new String[] { "t" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), new MockAnalyzer(random()), atLeast(1000));
  }

  /** blast some random strings through differently configured tokenizers */
  public void testRandomRegexps() throws Exception {
    int iters = TEST_NIGHTLY ? atLeast(30) : atLeast(1);
    for (int i = 0; i < iters; i++) {
      final CharacterRunAutomaton dfa = new CharacterRunAutomaton(
          AutomatonTestUtil.randomAutomaton(random()), Integer.MAX_VALUE);
      final boolean lowercase = random().nextBoolean();
      final int limit = TestUtil.nextInt(random(), 0, 500);
      Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer t = new MockTokenizer(dfa, lowercase, limit);
          return new TokenStreamComponents(t, t);
        }
      };
      checkRandomData(random(), a, 100);
      a.close();
    }
  }

  public void testForwardOffsets() throws Exception {
    int num = atLeast(1000);
    for (int i = 0; i < num; i++) {
      String s = TestUtil.randomHtmlishString(random(), 20);
      StringReader reader = new StringReader(s);
      MockCharFilter charfilter = new MockCharFilter(reader, 2);
      MockAnalyzer analyzer = new MockAnalyzer(random());
      try (TokenStream ts = analyzer.tokenStream("bogus", charfilter)) {
        ts.reset();
        while (ts.incrementToken()) {
          // consume
        }
        ts.end();
      }
    }
  }

  public void testWrapReader() throws Exception {
    // LUCENE-5153: test that wrapping an analyzer's reader is allowed
    final Random random = random();
    final Analyzer delegate = new MockAnalyzer(random);
    Analyzer a = new AnalyzerWrapper(delegate.getReuseStrategy()) {
      @Override
      protected Reader wrapReader(String fieldName, Reader reader) {
        return new MockCharFilter(reader, 7);
      }

      @Override
      protected Analyzer getWrappedAnalyzer(String fieldName) {
        return delegate;
      }
    };
    checkOneTerm(a, "abc", "aabc");
  }

  public void testChangeGaps() throws Exception {
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    final int positionGap = random().nextInt(1000);
    final int offsetGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());
    final Analyzer a = new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {
      @Override
      protected Analyzer getWrappedAnalyzer(String fieldName) {
        return delegate;
      }

      @Override
      public int getPositionIncrementGap(String fieldName) {
        return positionGap;
      }

      @Override
      public int getOffsetGap(String fieldName) {
        return offsetGap;
      }
    };

    final RandomIndexWriter writer =
        new RandomIndexWriter(random(), newDirectory(), a);
    final Document doc = new Document();
    final FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    doc.add(new Field("f", "a", ft));
    doc.add(new Field("f", "a", ft));
    writer.addDocument(doc);

    final LeafReader reader = getOnlyLeafReader(writer.getReader());
    final Fields fields = reader.getTermVectors(0);
    final Terms terms = fields.terms("f");
    final TermsEnum te = terms.iterator();
    assertEquals(new BytesRef("a"), te.next());
    final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
    assertEquals(0, dpe.nextDoc());
    assertEquals(2, dpe.freq());
    assertEquals(0, dpe.nextPosition());
    assertEquals(0, dpe.startOffset());
    final int endOffset = dpe.endOffset();
    // the second "a" must be shifted by the configured position and offset gaps
    assertEquals(1 + positionGap, dpe.nextPosition());
    assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
    assertEquals(null, te.next());
    reader.close();
    writer.close();
    writer.w.getDirectory().close();
  }
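
  /**
   * Not part of the original suite: a minimal sketch of the standard
   * TokenStream consumption pattern that the tests above rely on
   * (reset, incrementToken loop, end, close), here reading each term's text
   * through CharTermAttribute. The field name "body" and the input string are
   * arbitrary, and the expected output assumes MockAnalyzer's default
   * whitespace/lowercasing configuration, as exercised in testWhitespace.
   */
  public void testConsumePatternSketch() throws Exception {
    Analyzer a = new MockAnalyzer(random());
    try (TokenStream ts = a.tokenStream("body", "Two Tokens")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      StringBuilder sb = new StringBuilder();
      ts.reset();                    // must be called before incrementToken()
      while (ts.incrementToken()) {  // advances to the next token, if any
        sb.append(termAtt).append(' ');
      }
      ts.end();                      // records end-of-stream state (e.g. final offset)
      assertEquals("two tokens ", sb.toString());
    }
  }
}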