/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier;
import java.net.URI;
import java.net.URL;
import java.nio.CharBuffer;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.CrankyTokenFilter;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.payloads.IdentityEncoder;
import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.snowball.TestSnowball;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.tartarus.snowball.SnowballProgram;
import org.xml.sax.InputSource;

/** Tests random analysis chains. */
public class TestRandomChains extends BaseTokenStreamTestCase {

  static List<Constructor<? extends Tokenizer>> tokenizers;
  static List<Constructor<? extends TokenFilter>> tokenfilters;
  static List<Constructor<? extends CharFilter>> charfilters;

  private static final Predicate<Object[]> ALWAYS = (objects -> true);

  private static final Map<Constructor<?>,Predicate<Object[]>> brokenConstructors = new HashMap<>();
  static {
    try {
      brokenConstructors.put(
          LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class),
          ALWAYS);
      brokenConstructors.put(
          LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
          args -> {
            assert args.length == 3;
            return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
          });
      brokenConstructors.put(
          LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class),
          ALWAYS);
      brokenConstructors.put(
          LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
          args -> {
            assert args.length == 3;
            return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
          });
      brokenConstructors.put(
          LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class),
          ALWAYS);
      brokenConstructors.put(
          LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
          args -> {
            assert args.length == 3;
            return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
          });
      for (Class<?> c : Arrays.<Class<?>>asList(
          // TODO: can we promote some of these to be only
          // offsets offenders?
          // doesn't actually reset itself!
          CachingTokenFilter.class,
          // Not broken, simulates brokenness:
          CrankyTokenFilter.class,
          // Not broken: we forcefully add this, so we shouldn't
          // also randomly pick it:
          ValidatingTokenFilter.class,
          // TODO: needs to be a tokenizer, doesn't handle graph inputs properly
          // (a shingle or similar following will then cause pain)
          WordDelimiterFilter.class,
          // Cannot correct offsets when a char filter had changed them:
          WordDelimiterGraphFilter.class,
          // clones of core's filters:
          org.apache.lucene.analysis.core.StopFilter.class,
          org.apache.lucene.analysis.core.LowerCaseFilter.class)) {
        for (Constructor<?> ctor : c.getConstructors()) {
          brokenConstructors.put(ctor, ALWAYS);
        }
      }
    } catch (Exception e) {
      throw new Error(e);
    }
  }
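
  // Each entry above maps a constructor to a predicate over its actual arguments:
  // ALWAYS marks every invocation as broken, while the three-arg LimitToken*
  // variants are only broken when consumeAllTokens is false. The predicates are
  // consulted via broken(ctor, args) before a component is added to a chain.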

  // TODO: also fix these and remove (maybe):
  // Classes/options that don't produce consistent graph offsets:
  private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<>();
  static {
    try {
      for (Class<?> c : Arrays.<Class<?>>asList(
          ReversePathHierarchyTokenizer.class,
          PathHierarchyTokenizer.class,
          // TODO: it seems to mess up offsets!?
          WikipediaTokenizer.class,
          // TODO: doesn't handle graph inputs
          CJKBigramFilter.class,
          // TODO: doesn't handle graph inputs (or even look at positionIncrement)
          HyphenatedWordsFilter.class,
          // TODO: LUCENE-4983
          CommonGramsFilter.class,
          // TODO: doesn't handle graph inputs
          CommonGramsQueryFilter.class)) {
        for (Constructor<?> ctor : c.getConstructors()) {
          brokenOffsetsConstructors.put(ctor, ALWAYS);
        }
      }
    } catch (Exception e) {
      throw new Error(e);
    }
  }
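
  // beforeClass() walks the org.apache.lucene.analysis package, keeping every
  // concrete, public, non-deprecated Tokenizer/TokenFilter/CharFilter and the
  // constructors whose parameter types we know how to fill with random values.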
  @BeforeClass
  public static void beforeClass() throws Exception {
    List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
    tokenizers = new ArrayList<>();
    tokenfilters = new ArrayList<>();
    charfilters = new ArrayList<>();
    for (final Class<?> c : analysisClasses) {
      final int modifiers = c.getModifiers();
      if (
        // don't waste time with abstract classes or deprecated known-buggy ones
        Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
        || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
        || c.isAnnotationPresent(Deprecated.class)
        || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
      ) {
        continue;
      }

      for (final Constructor<?> ctor : c.getConstructors()) {
        // don't test synthetic or deprecated ctors, they likely have known bugs:
        if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) {
          continue;
        }
        if (Tokenizer.class.isAssignableFrom(c)) {
          assertTrue(ctor.toGenericString() + " has unsupported parameter types",
              allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
          tokenizers.add(castConstructor(Tokenizer.class, ctor));
        } else if (TokenFilter.class.isAssignableFrom(c)) {
          assertTrue(ctor.toGenericString() + " has unsupported parameter types",
              allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
          tokenfilters.add(castConstructor(TokenFilter.class, ctor));
        } else if (CharFilter.class.isAssignableFrom(c)) {
          assertTrue(ctor.toGenericString() + " has unsupported parameter types",
              allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
          charfilters.add(castConstructor(CharFilter.class, ctor));
        } else {
          fail("Cannot get here");
        }
      }
    }

    final Comparator<Constructor<?>> ctorComp = (arg0, arg1) -> arg0.toGenericString().compareTo(arg1.toGenericString());
    Collections.sort(tokenizers, ctorComp);
    Collections.sort(tokenfilters, ctorComp);
    Collections.sort(charfilters, ctorComp);
    if (VERBOSE) {
      System.out.println("tokenizers = " + tokenizers);
      System.out.println("tokenfilters = " + tokenfilters);
      System.out.println("charfilters = " + charfilters);
    }
  }

  @AfterClass
  public static void afterClass() {
    tokenizers = null;
    tokenfilters = null;
    charfilters = null;
  }

  /** Hack to work around Java's overly strict backwards compatibility:
   *  {@code Class<T>#getConstructors()} should return an unmodifiable {@code List<Constructor<T>>}, not an array! */
  @SuppressWarnings("unchecked")
  private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
    return (Constructor<T>) ctor;
  }

  public static List<Class<?>> getClassesForPackage(String pckgname) throws Exception {
    final List<Class<?>> classes = new ArrayList<>();
    collectClassesForPackage(pckgname, classes);
    assertFalse("No classes found in package '" + pckgname + "'; maybe your test classes are packaged as JAR file?", classes.isEmpty());
    return classes;
  }

  private static void collectClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
    final ClassLoader cld = TestRandomChains.class.getClassLoader();
    final String path = pckgname.replace('.', '/');
    final Enumeration<URL> resources = cld.getResources(path);
    while (resources.hasMoreElements()) {
      final URI uri = resources.nextElement().toURI();
      if (!"file".equalsIgnoreCase(uri.getScheme()))
        continue;
      final Path directory = Paths.get(uri);
      if (Files.exists(directory)) {
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(directory)) {
          for (Path file : stream) {
            if (Files.isDirectory(file)) {
              // recurse
              String subPackage = pckgname + "." + file.getFileName().toString();
              collectClassesForPackage(subPackage, classes);
            }
            String fname = file.getFileName().toString();
            if (fname.endsWith(".class")) {
              String clazzName = fname.substring(0, fname.length() - 6);
              // exclude Test classes that happen to be in these packages.
              // Class.forName'ing some of them can cause trouble.
              if (!clazzName.endsWith("Test") && !clazzName.startsWith("Test")) {
                // Don't run static initializers, as we won't use most of them.
                // Java will do that automatically once accessed/instantiated.
                classes.add(Class.forName(pckgname + '.' + clazzName, false, cld));
              }
            }
          }
        }
      }
    }
  }
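
  // Produces a random value for each constructor parameter type we support.
  // Keyed by Class object; an IdentityHashMap is fine here because Class
  // instances are canonical within a ClassLoader.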
  private static final Map<Class<?>,Function<Random,Object>> argProducers = new IdentityHashMap<Class<?>,Function<Random,Object>>() {{
    put(int.class, random -> {
      // TODO: could cause huge ram usage to use full int range for some filters
      // (e.g. allocate enormous arrays)
      // return Integer.valueOf(random.nextInt());
      return Integer.valueOf(TestUtil.nextInt(random, -50, 50));
    });
    put(char.class, random -> {
      // TODO: fix any filters that care to throw IAE instead.
      // also add a unicode validating filter to validate termAtt?
      // return Character.valueOf((char)random.nextInt(65536));
      while (true) {
        char c = (char) random.nextInt(65536);
        if (c < '\uD800' || c > '\uDFFF') {
          return Character.valueOf(c);
        }
      }
    });
    put(float.class, Random::nextFloat);
    put(boolean.class, Random::nextBoolean);
    put(byte.class, random -> (byte) random.nextInt(256));
    put(byte[].class, random -> {
      byte[] bytes = new byte[random.nextInt(256)];
      random.nextBytes(bytes);
      return bytes;
    });
    put(Random.class, random -> new Random(random.nextLong()));
    put(Version.class, random -> Version.LATEST);
    put(AttributeFactory.class, BaseTokenStreamTestCase::newAttributeFactory);
    put(Set.class, random -> {
      // TypeTokenFilter
      Set<String> set = new HashSet<>();
      int num = random.nextInt(5);
      for (int i = 0; i < num; i++) {
        set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
      }
      return set;
    });
    put(Collection.class, random -> {
      // CapitalizationFilter
      Collection<char[]> col = new ArrayList<>();
      int num = random.nextInt(5);
      for (int i = 0; i < num; i++) {
        col.add(TestUtil.randomSimpleString(random).toCharArray());
      }
      return col;
    });
    put(CharArraySet.class, random -> {
      int num = random.nextInt(10);
      CharArraySet set = new CharArraySet(num, random.nextBoolean());
      for (int i = 0; i < num; i++) {
        // TODO: make nastier
        set.add(TestUtil.randomSimpleString(random));
      }
      return set;
    });
    // TODO: don't want to make the exponentially slow ones Dawid documents
    // in TestPatternReplaceFilter, so don't use truly random patterns (for now)
    put(Pattern.class, random -> Pattern.compile("a"));
    put(Pattern[].class, random -> new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")});
    put(PayloadEncoder.class, random -> new IdentityEncoder()); // the other encoders will throw exceptions if tokens aren't numbers?
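    // The producers below load small test fixtures shipped with the analysis
    // tests (a minimal Hunspell dictionary, Danish hyphenation rules) instead
    // of generating the data randomly.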
    put(Dictionary.class, random -> {
      // TODO: make nastier
      InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff");
      InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic");
      try {
        return new Dictionary(new RAMDirectory(), "dictionary", affixStream, dictStream);
      } catch (Exception ex) {
        Rethrow.rethrow(ex);
        return null; // unreachable code
      }
    });
    put(HyphenationTree.class, random -> {
      // TODO: make nastier
      try {
        InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm());
        HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
        return hyphenator;
      } catch (Exception ex) {
        Rethrow.rethrow(ex);
        return null; // unreachable code
      }
    });
    put(SnowballProgram.class, random -> {
      try {
        String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)];
        Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
        return clazz.newInstance();
      } catch (Exception ex) {
        Rethrow.rethrow(ex);
        return null; // unreachable code
      }
    });
    put(String.class, random -> {
      // TODO: make nastier
      if (random.nextBoolean()) {
        // a token type
        return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
      } else {
        return TestUtil.randomSimpleString(random);
      }
    });
    put(NormalizeCharMap.class, random -> {
      NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
      // we can't add duplicate keys, or NormalizeCharMap gets angry
      Set<String> keys = new HashSet<>();
      int num = random.nextInt(5);
      //System.out.println("NormalizeCharMap=");
      for (int i = 0; i < num; i++) {
        String key = TestUtil.randomSimpleString(random);
        if (!keys.contains(key) && key.length() > 0) {
          String value = TestUtil.randomSimpleString(random);
          builder.add(key, value);
          keys.add(key);
          //System.out.println("mapping: '" + key + "' => '" + value + "'");
        }
      }
      return builder.build();
    });
    put(CharacterRunAutomaton.class, random -> {
      // TODO: could probably use a purely random automaton
      switch (random.nextInt(5)) {
        case 0: return MockTokenizer.KEYWORD;
        case 1: return MockTokenizer.SIMPLE;
        case 2: return MockTokenizer.WHITESPACE;
        case 3: return MockTokenFilter.EMPTY_STOPSET;
        default: return MockTokenFilter.ENGLISH_STOPSET;
      }
    });
    put(CharArrayMap.class, random -> {
      int num = random.nextInt(10);
      CharArrayMap<String> map = new CharArrayMap<>(num, random.nextBoolean());
      for (int i = 0; i < num; i++) {
        // TODO: make nastier
        map.put(TestUtil.randomSimpleString(random), TestUtil.randomSimpleString(random));
      }
      return map;
    });
    put(StemmerOverrideMap.class, random -> {
      int num = random.nextInt(10);
      StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(random.nextBoolean());
      for (int i = 0; i < num; i++) {
        String input = "";
        do {
          input = TestUtil.randomRealisticUnicodeString(random);
        } while (input.isEmpty());
        String out = "";
        do {
          out = TestUtil.randomRealisticUnicodeString(random);
        } while (out.isEmpty());
        builder.add(input, out);
      }
      try {
        return builder.build();
      } catch (Exception ex) {
        Rethrow.rethrow(ex);
        return null; // unreachable code
      }
    });
    put(SynonymMap.class, new Function<Random, Object>() {
      @Override
      public Object apply(Random random) {
        SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
        final int numEntries = atLeast(10);
        for (int j = 0; j < numEntries; j++) {
          addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
        }
        try {
          return b.build();
        } catch (Exception ex) {
          Rethrow.rethrow(ex);
          return null; // unreachable code
        }
      }

      private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
        b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
              new CharsRef(output.replaceAll(" +", "\u0000")),
              keepOrig);
      }

      private String randomNonEmptyString(Random random) {
        while (true) {
          final String s = TestUtil.randomUnicodeString(random).trim();
          if (s.length() != 0 && s.indexOf('\u0000') == -1) {
            return s;
          }
        }
      }
    });
    put(DateFormat.class, random -> {
      if (random.nextBoolean()) return null;
      return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random));
    });
    put(Automaton.class, random -> {
      return Operations.determinize(new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(),
          Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    });
  }};
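
  // Whitelists of constructor parameter types we can synthesize; beforeClass()
  // asserts that every discovered constructor draws only from these sets.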
  static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
  static {
    allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
    allowedTokenizerArgs.addAll(argProducers.keySet());
    allowedTokenizerArgs.add(Reader.class);
    allowedTokenizerArgs.add(AttributeFactory.class);
    allowedTokenizerArgs.add(AttributeSource.class);
    allowedTokenizerArgs.add(Automaton.class);

    allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
    allowedTokenFilterArgs.addAll(argProducers.keySet());
    allowedTokenFilterArgs.add(TokenStream.class);
    // TODO: fix this one, that's broken:
    allowedTokenFilterArgs.add(CommonGramsFilter.class);

    allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
    allowedCharFilterArgs.addAll(argProducers.keySet());
    allowedCharFilterArgs.add(Reader.class);
  }

  @SuppressWarnings("unchecked")
  static <T> T newRandomArg(Random random, Class<T> paramType) {
    final Function<Random,Object> producer = argProducers.get(paramType);
    assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
    return (T) producer.apply(random);
  }

  static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) {
    Object[] args = new Object[paramTypes.length];
    for (int i = 0; i < args.length; i++) {
      Class<?> paramType = paramTypes[i];
      if (paramType == AttributeSource.class) {
        // TODO: args[i] = new AttributeSource();
        // this is currently too scary to deal with!
        args[i] = null; // force IAE
      } else {
        args[i] = newRandomArg(random, paramType);
      }
    }
    return args;
  }

  static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
    Object[] args = new Object[paramTypes.length];
    for (int i = 0; i < args.length; i++) {
      Class<?> paramType = paramTypes[i];
      if (paramType == Reader.class) {
        args[i] = reader;
      } else {
        args[i] = newRandomArg(random, paramType);
      }
    }
    return args;
  }

  static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
    Object[] args = new Object[paramTypes.length];
    for (int i = 0; i < args.length; i++) {
      Class<?> paramType = paramTypes[i];
      if (paramType == TokenStream.class) {
        args[i] = stream;
      } else if (paramType == CommonGramsFilter.class) {
        // TODO: fix this one, that's broken: CommonGramsQueryFilter takes this one explicitly
        args[i] = new CommonGramsFilter(stream, newRandomArg(random, CharArraySet.class));
      } else {
        args[i] = newRandomArg(random, paramType);
      }
    }
    return args;
  }
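
  /**
   * Analyzer that rebuilds the same random chain from a fixed seed every time
   * its components are requested, so a failing chain can be reproduced exactly
   * from the seed printed in the test output.
   */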
  static class MockRandomAnalyzer extends Analyzer {
    final long seed;

    MockRandomAnalyzer(long seed) {
      this.seed = seed;
    }

    public boolean offsetsAreCorrect() {
      // TODO: can we not do the full chain here!?
      Random random = new Random(seed);
      TokenizerSpec tokenizerSpec = newTokenizer(random);
      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
      return filterSpec.offsetsAreCorrect;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Random random = new Random(seed);
      TokenizerSpec tokenizerSpec = newTokenizer(random);
      //System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
      //System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
      return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
    }

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
      Random random = new Random(seed);
      CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
      return charfilterspec.reader;
    }

    @Override
    public String toString() {
      Random random = new Random(seed);
      StringBuilder sb = new StringBuilder();
      CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
      sb.append("\ncharfilters=");
      sb.append(charFilterSpec.toString);
      // intentional: initReader gets its own separate random
      random = new Random(seed);
      TokenizerSpec tokenizerSpec = newTokenizer(random);
      sb.append("\n");
      sb.append("tokenizer=");
      sb.append(tokenizerSpec.toString);
      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
      sb.append("\n");
      sb.append("filters=");
      sb.append(tokenFilterSpec.toString);
      sb.append("\n");
      sb.append("offsetsAreCorrect=");
      sb.append(tokenFilterSpec.offsetsAreCorrect);
      return sb.toString();
    }

    private <T> T createComponent(Constructor<T> ctor, Object[] args, StringBuilder descr) {
      try {
        final T instance = ctor.newInstance(args);
        /*
        if (descr.length() > 0) {
          descr.append(",");
        }
        */
        descr.append("\n ");
        descr.append(ctor.getDeclaringClass().getName());
        String params = Arrays.deepToString(args);
        params = params.substring(1, params.length() - 1);
        descr.append("(").append(params).append(")");
        return instance;
      } catch (InvocationTargetException ite) {
        final Throwable cause = ite.getCause();
        if (cause instanceof IllegalArgumentException || cause instanceof UnsupportedOperationException) {
          // that's OK, ignore
          if (VERBOSE) {
            System.err.println("Ignoring IAE/UOE from ctor:");
            cause.printStackTrace(System.err);
          }
        } else {
          Rethrow.rethrow(cause);
        }
      } catch (IllegalAccessException | InstantiationException iae) {
        Rethrow.rethrow(iae);
      }
      return null; // no success
    }

    private boolean broken(Constructor<?> ctor, Object[] args) {
      final Predicate<Object[]> pred = brokenConstructors.get(ctor);
      return pred != null && pred.test(args);
    }

    private boolean brokenOffsets(Constructor<?> ctor, Object[] args) {
      final Predicate<Object[]> pred = brokenOffsetsConstructors.get(ctor);
      return pred != null && pred.test(args);
    }
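
    // The builders below keep drawing random constructors until one both passes
    // the broken-args check and actually instantiates; createComponent returns
    // null when a ctor rejects our random args with an IAE/UOE.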
    // create a new random tokenizer from classpath
    private TokenizerSpec newTokenizer(Random random) {
      TokenizerSpec spec = new TokenizerSpec();
      while (spec.tokenizer == null) {
        final Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
        final StringBuilder descr = new StringBuilder();
        final Object[] args = newTokenizerArgs(random, ctor.getParameterTypes());
        if (broken(ctor, args)) {
          continue;
        }
        spec.tokenizer = createComponent(ctor, args, descr);
        if (spec.tokenizer != null) {
          spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
          spec.toString = descr.toString();
        }
      }
      return spec;
    }

    private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
      CharFilterSpec spec = new CharFilterSpec();
      spec.reader = reader;
      StringBuilder descr = new StringBuilder();
      int numFilters = random.nextInt(3);
      for (int i = 0; i < numFilters; i++) {
        while (true) {
          final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
          final Object[] args = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
          if (broken(ctor, args)) {
            continue;
          }
          reader = createComponent(ctor, args, descr);
          if (reader != null) {
            spec.reader = reader;
            break;
          }
        }
      }
      spec.toString = descr.toString();
      return spec;
    }
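
    // Chain construction threads offsetsAreCorrect through every stage: once a
    // known offsets offender is picked the flag stays false, and it is handed to
    // each ValidatingTokenFilter so validation matches what the chain can promise.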
    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
      TokenFilterSpec spec = new TokenFilterSpec();
      spec.offsetsAreCorrect = offsetsAreCorrect;
      spec.stream = tokenizer;
      StringBuilder descr = new StringBuilder();
      int numFilters = random.nextInt(5);
      for (int i = 0; i < numFilters; i++) {
        // Insert ValidatingTF after each stage so we can
        // catch problems right after the TF that "caused"
        // them:
        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);

        while (true) {
          final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
          // hack: MockGraph/MockLookahead have assertions that will trip if they follow
          // an offsets violator. so we can't use them after e.g. WikipediaTokenizer
          if (!spec.offsetsAreCorrect &&
              (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
               || ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
            continue;
          }

          final Object[] args = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
          if (broken(ctor, args)) {
            continue;
          }
          final TokenFilter flt = createComponent(ctor, args, descr);
          if (flt != null) {
            spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
            spec.stream = flt;
            break;
          }
        }
      }

      // Insert ValidatingTF after each stage so we can
      // catch problems right after the TF that "caused"
      // them:
      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);

      spec.toString = descr.toString();
      return spec;
    }
  }

  static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
    boolean readSomething;

    CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
      super(in);
    }

    @Override
    public int correct(int currentOff) {
      return currentOff; // we don't change any offsets
    }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
      readSomething = true;
      return input.read(cbuf, off, len);
    }

    @Override
    public int read() throws IOException {
      readSomething = true;
      return input.read();
    }

    @Override
    public int read(CharBuffer target) throws IOException {
      readSomething = true;
      return input.read(target);
    }

    @Override
    public int read(char[] cbuf) throws IOException {
      readSomething = true;
      return input.read(cbuf);
    }

    @Override
    public long skip(long n) throws IOException {
      readSomething = true;
      return input.skip(n);
    }

    @Override
    public void mark(int readAheadLimit) throws IOException {
      input.mark(readAheadLimit);
    }

    @Override
    public boolean markSupported() {
      return input.markSupported();
    }

    @Override
    public boolean ready() throws IOException {
      return input.ready();
    }

    @Override
    public void reset() throws IOException {
      input.reset();
    }
  }

  static class TokenizerSpec {
    Tokenizer tokenizer;
    String toString;
    boolean offsetsAreCorrect = true;
  }

  static class TokenFilterSpec {
    TokenStream stream;
    String toString;
    boolean offsetsAreCorrect = true;
  }

  static class CharFilterSpec {
    Reader reader;
    String toString;
  }

  public void testRandomChains() throws Throwable {
    int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
    Random random = random();
    for (int i = 0; i < numIterations; i++) {
      try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
        if (VERBOSE) {
          System.out.println("Creating random analyzer: " + a);
        }
        try {
          checkNormalize(a);
          checkRandomData(random, a, 500 * RANDOM_MULTIPLIER, 20, false,
              false /* We already validate our own offsets... */);
        } catch (Throwable e) {
          System.err.println("Exception from random analyzer: " + a);
          throw e;
        }
      }
    }
  }

  public void checkNormalize(Analyzer a) {
    // normalization should not modify characters that may be used for wildcards
    // or regular expressions
    String s = "([0-9]+)?*";
    assertEquals(s, a.normalize("dummy", s).utf8ToString());
  }

  // we might regret this decision...
  public void testRandomChainsWithLargeStrings() throws Throwable {
    int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
    Random random = random();
    for (int i = 0; i < numIterations; i++) {
      try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
        if (VERBOSE) {
          System.out.println("Creating random analyzer: " + a);
        }
        try {
          checkRandomData(random, a, 50 * RANDOM_MULTIPLIER, 80, false,
              false /* We already validate our own offsets... */);
        } catch (Throwable e) {
          System.err.println("Exception from random analyzer: " + a);
          throw e;
        }
      }
    }
  }
}