/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.concordance;

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

public class ConcordanceTestBase extends LuceneTestCase {

  protected static final String FIELD = "f1";

  public static Analyzer getAnalyzer(final CharacterRunAutomaton stops) {
    return getAnalyzer(stops, random().nextInt(10000), random().nextInt(10000));
  }

  public static Analyzer getAnalyzer(final CharacterRunAutomaton stops,
                                     final int posIncGap, final int charOffsetGap) {
    return new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
        TokenFilter filter = new MockTokenFilter(tokenizer, stops);
        return new TokenStreamComponents(tokenizer, filter);
      }

      @Override
      public int getPositionIncrementGap(String fieldName) {
        return posIncGap;
      }

      @Override
      public int getOffsetGap(String fieldName) {
        return charOffsetGap;
      }
    };
  }

  public Directory getDirectory(Analyzer analyzer, String[] vals) throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        newIndexWriterConfig(analyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
            .setMergePolicy(newLogMergePolicy()));

    for (String s : vals) {
      Document d = new Document();
      d.add(newTextField(FIELD, s, Field.Store.YES));
      writer.addDocument(d);
    }
    writer.close();
    return directory;
  }

  public Directory getDirectory(Analyzer analyzer, List<String[]> input) throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        newIndexWriterConfig(analyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
            .setMergePolicy(newLogMergePolicy()));

    for (String[] vals : input) {
      Document d = new Document();
      for (String s : vals) {
        d.add(newTextField(FIELD, s, Field.Store.YES));
      }
      writer.addDocument(d);
    }
    writer.close();
    return directory;
  }
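
  // Hedged usage sketch, not part of the original class: one way a test could
  // combine getAnalyzer(...) and getDirectory(...) to build a throwaway index.
  // MockTokenFilter.EMPTY_STOPSET is Lucene's stock "no stopwords" automaton;
  // the method name and sample strings are illustrative assumptions.
  protected Directory exampleDirectory() throws IOException {
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    String[] docs = new String[]{"the quick brown fox", "jumps over the lazy dog"};
    return getDirectory(analyzer, docs); // caller is responsible for closing it
  }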
  Directory buildNeedleIndex(String needle, Analyzer analyzer,
                             int numFieldValues) throws Exception {
    IndexWriterConfig config = newIndexWriterConfig(random(), analyzer)
        .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
        .setMergePolicy(newLogMergePolicy());
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

    // create one document with a multivalued field; the needle appears exactly
    // once per field value: at the start, in the middle, or at the end
    String[] fs = new String[numFieldValues];
    for (int i = 0; i < numFieldValues; i++) {
      float r = random().nextFloat();
      String doc;
      if (r <= 0.33f) {
        doc = needle + " " + getRandomWords(29, needle, analyzer);
      } else if (r <= 0.66f) {
        doc = getRandomWords(13, needle, analyzer) + " " + needle + " "
            + getRandomWords(17, needle, analyzer);
      } else {
        doc = getRandomWords(31, needle, analyzer) + " " + needle;
      }
      fs[i] = doc;
    }

    Document d = new Document();
    FieldType type = new FieldType();
    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    type.setStored(true);
    type.setTokenized(true);
    for (String s : fs) {
      d.add(newField(random(), FIELD, s, type));
    }
    writer.addDocument(d);
    writer.close();
    return directory;
  }

  /**
   * Builds a string of random words, skipping any term that matches the
   * needle. This assumes there is no stop filter in the analyzer; a
   * whitespace tokenizer works best.
   */
  private String getRandomWords(int numWords, String needle, Analyzer analyzer)
      throws Exception {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < numWords; i++) {
      sb.append(TestUtil.randomUnicodeString(random(), 31));
      sb.append(" ");
    }
    List<String> terms = SimpleAnalyzerUtil.getTermStrings(sb.toString(), FIELD, analyzer);

    StringBuilder rsb = new StringBuilder();
    for (int i = 0; i < numWords && i < terms.size(); i++) {
      String cand = terms.get(i);
      if (!needle.equals(cand)) {
        if (rsb.length() > 0) {
          rsb.append(" ");
        }
        rsb.append(cand);
      }
    }
    return rsb.toString();
  }

  String getNeedle(Analyzer analyzer) {
    // try to get a term that would come out of the analyzer
    for (int i = 0; i < 10; i++) {
      // start with a random base string
      String baseString = TestUtil.randomUnicodeString(random(), random().nextInt(10) + 2);
      try {
        // run it through the analyzer and take the first non-empty term
        List<String> terms = SimpleAnalyzerUtil.getTermStrings(baseString, FIELD, analyzer);
        for (String t : terms) {
          if (t.length() > 0) {
            return t;
          }
        }
      } catch (IOException e) {
        // swallow the exception and retry with a new random string
      }
    }
    // if nothing is found in 10 tries, fall back to the literal string "needle"
    return "needle";
  }
}
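
// Hedged sketch, an assumption rather than part of the Lucene codebase: a
// minimal concrete test built on the helpers above. It only verifies that
// buildNeedleIndex produces a single readable document; concordance-specific
// assertions would replace the placeholder comment. Fully qualified names are
// used for the reader classes to avoid touching the import list above.
class TestConcordanceTestBaseExample extends ConcordanceTestBase {
  public void testNeedleIndexIsReadable() throws Exception {
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    String needle = getNeedle(analyzer);
    Directory directory = buildNeedleIndex(needle, analyzer, 3);
    org.apache.lucene.index.IndexReader reader =
        org.apache.lucene.index.DirectoryReader.open(directory);
    try {
      // buildNeedleIndex writes exactly one document with a multivalued field
      assertEquals(1, reader.numDocs());
      // concordance assertions against the needle would go here
    } finally {
      reader.close();
      directory.close();
    }
  }
}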