/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.queryparser.spans;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.KStemFilterFactory;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanBoostQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests {@code SpanQueryParser} against analyzers with non-trivial token streams:
 * a synonym-injecting filter, a mock "non-whitespace language" filter that splits
 * tokens on '_', and a vowel-uppercasing filter used to prove that per-field
 * whole-term and multi-term analyzers are applied independently.
 */
public class TestAdvancedAnalyzers extends SQPTestBase {

  private static final String FIELD1 = "f1";
  private static final String FIELD2 = "f2";
  private static final String FIELD3 = "f3";
  private static final String FIELD4 = "f4";

  private static Directory directory;
  private static Analyzer synAnalyzer;
  private static Analyzer baseAnalyzer;
  private static Analyzer ucVowelAnalyzer;
  private static Analyzer ucVowelMTAnalyzer;
  private static Analyzer lcMultiTermAnalyzer;
  private static Analyzer complexAnalyzer;

  @BeforeClass
  public static void beforeClass() throws Exception {
    // Lowercasing keyword analyzer, used as the multi-term (wildcard/fuzzy/prefix) analyzer.
    lcMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);

    complexAnalyzer = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        // Analysis factories may mutate the argument map they are handed
        // (e.g. removing "luceneMatchVersion"), so each factory gets its own
        // mutable HashMap rather than a shared or immutable map.
        Map<String, String> attrs = new HashMap<>();
        attrs.put("generateWordParts", "1");
        attrs.put("generateNumberParts", "1");
        attrs.put("catenateWords", "1");
        attrs.put("catenateNumbers", "1");
        attrs.put("catenateAll", "1");
        attrs.put("splitOnCaseChange", "1");
        attrs.put("preserveOriginal", "1");
        TokenFilter filter = new WordDelimiterFilterFactory(attrs).create(tokenizer);
        filter = new KStemFilterFactory(new HashMap<String, String>()).create(filter);
        filter = new RemoveDuplicatesTokenFilterFactory(new HashMap<String, String>()).create(filter);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };

    // Splits on '_' and injects synonyms ("abc" -> +"qrs", +"tuv").
    synAnalyzer = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE,
            true);
        TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
        filter = new MockSynFilter(filter);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };

    // Splits on '_' only; no synonyms. Matches how FIELD1/FIELD3 are indexed.
    baseAnalyzer = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE,
            true);
        TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };

    // Uppercases vowels; matches how FIELD2/FIELD4 are indexed.
    ucVowelAnalyzer = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE,
            true);
        TokenFilter filter = new MockUCVowelFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };

    // Keyword variant of the vowel-uppercaser, for multi-term normalization.
    ucVowelMTAnalyzer = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD,
            true);
        TokenFilter filter = new MockUCVowelFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };

    // Throwaway analyzer for indexing FIELD4; deliberately not stored in a
    // static field so only the analyzers the tests need outlive beforeClass().
    Analyzer tmpUCVowelAnalyzer = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE,
            true);
        TokenFilter filter = new MockUCVowelFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };

    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        newIndexWriterConfig(baseAnalyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
            .setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[]{
        "abc_def",
        "lmnop",
        "abc one",
        "abc two",
        "qrs one",
        "qrs two",
        "tuv one",
        "tuv two",
        "qrs tuv",
        "qrs_tuv"
    };
    for (int i = 0; i < docs.length; i++) {
      Document doc = new Document();
      // FIELD1/FIELD3: analyzed with baseAnalyzer (the writer's default).
      doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
      // FIELD2/FIELD4: analyzed with the vowel-uppercasing analyzer via an
      // explicit TokenStream, overriding the writer's default analyzer.
      TextField tf = new TextField(FIELD2, docs[i], Field.Store.YES);
      tf.setTokenStream(ucVowelAnalyzer.tokenStream(FIELD2, docs[i]));
      doc.add(tf);
      doc.add(newTextField(FIELD3, docs[i], Field.Store.YES));
      TextField tf4 = new TextField(FIELD4, docs[i], Field.Store.YES);
      tf4.setTokenStream(tmpUCVowelAnalyzer.tokenStream(FIELD4, docs[i]));
      doc.add(tf4);
      writer.addDocument(doc);
    }
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
  }

  @AfterClass
  public static void afterClass() throws Exception {
    // Close index resources and null ALL static fields set in beforeClass()
    // (the original nulled only two analyzers, leaking the rest across suites).
    reader.close();
    directory.close();
    reader = null;
    directory = null;
    synAnalyzer = null;
    baseAnalyzer = null;
    ucVowelAnalyzer = null;
    ucVowelMTAnalyzer = null;
    lcMultiTermAnalyzer = null;
    complexAnalyzer = null;
  }

  @Test
  public void testSynBasic() throws Exception {
    SpanQueryParser p = new SpanQueryParser(FIELD1, synAnalyzer, synAnalyzer);
    countSpansDocs(p, FIELD1, "tuv", 4, 4);
    countSpansDocs(p, FIELD1, "abc", 11, 9);
    countSpansDocs(p, FIELD1, "\"abc one\"", 3, 3);
  }

  @Test
  public void testNonWhiteSpace() throws Exception {
    SpanQueryParser p = new SpanQueryParser(FIELD1, baseAnalyzer, baseAnalyzer);
    String s = "[zqx_qrs^3.0]~3^2";
    Query q = p.parse(s);
    assertTrue(q instanceof SpanBoostQuery);
    assertTrue(((SpanBoostQuery) q).getQuery() instanceof SpanNearQuery);
    SpanNearQuery near = (SpanNearQuery) ((SpanBoostQuery) q).getQuery();
    SpanQuery[] clauses = near.getClauses();
    assertEquals(2, clauses.length);
    assertEquals(3, near.getSlop());
    assertTrue(clauses[0] instanceof SpanTermQuery);
    assertTrue(clauses[1] instanceof SpanTermQuery);
    assertEquals("zqx", ((SpanTermQuery) clauses[0]).getTerm().text());
    assertEquals("qrs", ((SpanTermQuery) clauses[1]).getTerm().text());
    //take the boost from the phrase, ignore boost on term
    //not necessarily right choice, but this is how it works now
    assertEquals(2.0f, ((SpanBoostQuery) q).getBoost(), 0.00001f);

    s = "[zqx2_qrs3 lmnop]~3";
    p.setAutoGeneratePhraseQueries(true);
    q = p.parse(s);
    assertTrue(q instanceof SpanQuery);
    assertTrue(q instanceof SpanNearQuery);
    near = (SpanNearQuery) q;
    clauses = near.getClauses();
    assertEquals(2, clauses.length);
    assertEquals(3, near.getSlop());
    assertTrue(clauses[0] instanceof SpanNearQuery);
    assertTrue(clauses[1] instanceof SpanTermQuery);
    SpanNearQuery child = (SpanNearQuery) clauses[0];
    SpanQuery[] childClauses = child.getClauses();
    assertEquals(2, childClauses.length);
    assertEquals("zqx", ((SpanTermQuery) childClauses[0]).getTerm().text());
    assertEquals("qrs", ((SpanTermQuery) childClauses[1]).getTerm().text());
    assertTrue(child.isInOrder());
    assertEquals(child.getSlop(), 0);
  }

  //test different initializations/settings with multifield analyzers
  @Test
  public void testAnalyzerCombos() throws Exception {
    //basic, correct set up
    SpanQueryParser p = new SpanQueryParser(FIELD1, baseAnalyzer, lcMultiTermAnalyzer);
    assertEquals(1, countDocs(p.getField(), p.parse("lmnop")));
    assertEquals(1, countDocs(p.getField(), p.parse("lm*op")));
    assertEquals(1, countDocs(p.getField(), p.parse("LMNOP")));
    assertEquals(1, countDocs(p.getField(), p.parse("LM*OP")));

    //basic, correct set up
    p = new SpanQueryParser(FIELD2, ucVowelAnalyzer, lcMultiTermAnalyzer);
    assertEquals(1, countDocs(p.getField(), p.parse("lmnop")));
    assertEquals(1, countDocs(p.getField(), p.parse("LMNOP")));
    assertEquals(0, countDocs(p.getField(), p.parse("LM*OP")));
    //set to lowercase only, won't analyze
    assertEquals(0, countDocs(p.getField(), p.parse("lm*op")));

    p = new SpanQueryParser(FIELD2, ucVowelAnalyzer, ucVowelMTAnalyzer);
    assertEquals(1, countDocs(p.getField(), p.parse("lm*op")));
    assertEquals(1, countDocs(p.getField(), p.parse("LM*OP")));
    //try sister field, to prove that default analyzer is ucVowelAnalyzer for
    //unspecified fields
    assertEquals(1, countDocs(FIELD4, p.parse(FIELD4 + ":lmnop")));
    assertEquals(1, countDocs(FIELD4, p.parse(FIELD4 + ":lm*op")));
    assertEquals(1, countDocs(FIELD4, p.parse(FIELD4 + ":LMNOP")));
    assertEquals(1, countDocs(FIELD4, p.parse(FIELD4 + ":LM*OP")));
    //try mismatching sister field
    assertEquals(0, countDocs(FIELD3, p.parse(FIELD3 + ":lmnop")));
    assertEquals(0, countDocs(FIELD3, p.parse(FIELD3 + ":lm*op")));
    assertEquals(0, countDocs(FIELD3, p.parse(FIELD3 + ":LMNOP")));
    assertEquals(0, countDocs(FIELD3, p.parse(FIELD3 + ":LM*OP")));

    p = new SpanQueryParser(FIELD1, baseAnalyzer, ucVowelMTAnalyzer);
    assertEquals(1, countDocs(FIELD1, p.parse("lmnop")));
    assertEquals(1, countDocs(FIELD1, p.parse("LMNOP")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lm*op")));

    //advanced, correct set up for both
    p = new SpanQueryParser(FIELD2, ucVowelAnalyzer, ucVowelMTAnalyzer);
    assertEquals(1, countDocs(FIELD2, p.parse("lmnop")));
    assertEquals(1, countDocs(FIELD2, p.parse("LMNOP")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lmnop")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LMNOP")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lm*op")));

    p = new SpanQueryParser(FIELD2, ucVowelAnalyzer, null);
    assertEquals(1, countDocs(FIELD2, p.parse("lmnop")));
    //analyzer still used on whole terms; don't forget!
    assertEquals(1, countDocs(FIELD2, p.parse("LMNOP")));
    assertEquals(0, countDocs(FIELD2, p.parse("LM*OP")));

    p = new SpanQueryParser(FIELD2, ucVowelAnalyzer, lcMultiTermAnalyzer);
    assertEquals(1, countDocs(FIELD2, p.parse("lmnop")));
    assertEquals(1, countDocs(FIELD2, p.parse("LMNOP")));
    assertEquals(0, countDocs(FIELD2, p.parse("LM*OP")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":LM*OP")));

    //mismatch between default field and default analyzer; should return 0
    p = new SpanQueryParser(FIELD1, ucVowelAnalyzer, ucVowelMTAnalyzer);
    assertEquals(0, countDocs(FIELD2, p.parse("lmnop")));
    assertEquals(0, countDocs(FIELD2, p.parse("LMNOP")));
    assertEquals(0, countDocs(FIELD2, p.parse("lmnOp")));

    p = new SpanQueryParser(FIELD1, baseAnalyzer, ucVowelMTAnalyzer);
    //cstr with two analyzers sets normMultiTerms = NORM_MULTI_TERM.ANALYZE
    //can't find any in field1 because these trigger multiTerm analysis
    assertEquals(0, countDocs(FIELD1, p.parse(FIELD1 + ":lm*op")));
    assertEquals(0, countDocs(FIELD1, p.parse(FIELD1 + ":lmno*")));
    assertEquals(0, countDocs(FIELD1, p.parse(FIELD1 + ":lmmop~1")));
    assertEquals(0, countDocs(FIELD1, p.parse(FIELD1 + ":LM*OP")));
    assertEquals(0, countDocs(FIELD1, p.parse(FIELD1 + ":LMNO*")));
    assertEquals(0, countDocs(FIELD1, p.parse(FIELD1 + ":LMMOP~1")));
    //can find these in field2 because of multiterm analysis
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lm*op")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lmno*")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lmmop~1")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LM*OP")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LMNO*")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LMMOP~1")));

    //try basic use case
    p = new SpanQueryParser(FIELD1, baseAnalyzer, lcMultiTermAnalyzer);
    //can't find these in field2 because multiterm analysis is using baseAnalyzer
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":lm*op")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":lmno*")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":lmmop~1")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":LM*OP")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":LMNO*")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":LMMOP~1")));

    p = new SpanQueryParser(FIELD1, ucVowelAnalyzer, ucVowelMTAnalyzer);
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lmnop")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lm*op")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lmno*")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lmmop~1")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LMNOP")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LM*OP")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LMNO*")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LMMOP~1")));

    //now try adding the wrong analyzer for the whole term, but the
    //right multiterm analyzer
    p = new SpanQueryParser(FIELD2, baseAnalyzer, ucVowelMTAnalyzer);
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":lmnop")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lm*op")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lmno*")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":lmmop~1")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":LMNOP")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LM*OP")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LMNO*")));
    assertEquals(1, countDocs(FIELD2, p.parse(FIELD2 + ":LMMOP~1")));

    //now set them completely improperly
    p = new SpanQueryParser(FIELD2, baseAnalyzer, lcMultiTermAnalyzer);
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":lmnop")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":lm*op")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":lmno*")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":lmmop~1")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":LMNOP")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":LM*OP")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":LMNO*")));
    assertEquals(0, countDocs(FIELD2, p.parse(FIELD2 + ":LMMOP~1")));
  }

  /**
   * Mocks a synonym filter. When it encounters "abc" it adds "qrs" and "tuv"
   * at the same position (position increment 0).
   */
  private final static class MockSynFilter extends TokenFilter {
    private final CharTermAttribute termAtt;
    private final PositionIncrementAttribute posIncrAtt;
    private final List<String> synBuffer = new LinkedList<String>();

    public MockSynFilter(TokenStream in) {
      super(in);
      termAtt = addAttribute(CharTermAttribute.class);
      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    }

    @Override
    public final boolean incrementToken() throws IOException {
      if (synBuffer.size() > 0) {
        termAtt.setEmpty().append(synBuffer.remove(0));
        posIncrAtt.setPositionIncrement(0);
        return true;
      } else {
        boolean next = input.incrementToken();
        if (!next) {
          return false;
        }
        String text = termAtt.toString();
        if (text.equals("abc")) {
          synBuffer.add("qrs");
          synBuffer.add("tuv");
        }
        return true;
      }
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      // Required by the TokenStream reuse contract: drop queued synonyms so
      // a reused stream does not replay tokens from a previous input.
      synBuffer.clear();
    }
  }

  /*
   * Mocks what happens in a non-whitespace language. Tokenizes on white space and "_".
   */
  private final static class MockNonWhitespaceFilter extends TokenFilter {
    private final CharTermAttribute termAtt;
    private final List<String> buffer = new LinkedList<String>();

    public MockNonWhitespaceFilter(TokenStream in) {
      super(in);
      termAtt = addAttribute(CharTermAttribute.class);
    }

    @Override
    public final boolean incrementToken() throws IOException {
      if (buffer.size() > 0) {
        //emit a part split off from an earlier token
        termAtt.setEmpty().append(buffer.remove(0));
        return true;
      } else {
        boolean next = input.incrementToken();
        if (!next) {
          return false;
        }
        //split the incoming token on '_': emit the first part now,
        //queue the rest for subsequent calls
        String text = termAtt.toString();
        String[] bits = text.split("_");
        String ret = text;
        if (bits.length > 1) {
          ret = bits[0];
          for (int i = 1; i < bits.length; i++) {
            buffer.add(bits[i]);
          }
        }
        termAtt.setEmpty().append(ret);
        return true;
      }
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      // Required by the TokenStream reuse contract: drop queued token parts.
      buffer.clear();
    }
  }

  //mocks uppercasing vowels to test different analyzers for different fields
  private final static class MockUCVowelFilter extends TokenFilter {
    private static final Pattern PATTERN = Pattern.compile("([aeiou])");
    private final CharTermAttribute termAtt;

    public MockUCVowelFilter(TokenStream in) {
      super(in);
      termAtt = addAttribute(CharTermAttribute.class);
    }

    @Override
    public final boolean incrementToken() throws IOException {
      boolean next = input.incrementToken();
      if (!next) {
        return false;
      }
      //Locale.ROOT keeps casing deterministic regardless of the default
      //locale (e.g. Turkish dotted/dotless 'i')
      String text = termAtt.toString().toLowerCase(Locale.ROOT);
      Matcher m = PATTERN.matcher(text);
      //StringBuffer (not StringBuilder) because Matcher.appendReplacement
      //requires it on pre-Java-9 APIs
      StringBuffer sb = new StringBuffer();
      while (m.find()) {
        m.appendReplacement(sb, m.group(1).toUpperCase(Locale.ROOT));
      }
      m.appendTail(sb);
      termAtt.setEmpty().append(sb.toString());
      return true;
    }
  }
}