/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.shingle; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; /** * A test class for ShingleAnalyzerWrapper as regards queries and scoring. */ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { private Analyzer analyzer; private IndexSearcher searcher; private IndexReader reader; private Directory directory; /** * Set up a new index in RAM with three test phrases and the supplied Analyzer. * * @throws Exception if an error occurs with index writer or searcher */ @Override public void setUp() throws Exception { super.setUp(); analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 2); directory = newDirectory(); IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer)); Document doc; doc = new Document(); doc.add(new TextField("content", "please divide this sentence into shingles", Field.Store.YES)); writer.addDocument(doc); doc = new Document(); doc.add(new TextField("content", "just another test sentence", Field.Store.YES)); writer.addDocument(doc); doc = new Document(); doc.add(new TextField("content", "a sentence which contains no test", Field.Store.YES)); writer.addDocument(doc); writer.close(); reader = DirectoryReader.open(directory); searcher = newSearcher(reader); } @Override public void tearDown() throws Exception { reader.close(); directory.close(); analyzer.close(); super.tearDown(); } protected void compareRanks(ScoreDoc[] hits, int[] ranks) throws Exception { assertEquals(ranks.length, hits.length); for (int i = 0; i < ranks.length; i++) { assertEquals(ranks[i], hits[i].doc); } } /* * This shows how to construct a phrase query containing shingles. */ public void testShingleAnalyzerWrapperPhraseQuery() throws Exception { PhraseQuery.Builder builder = new PhraseQuery.Builder(); try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) { int j = -1; PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { j += posIncrAtt.getPositionIncrement(); String termText = termAtt.toString(); builder.add(new Term("content", termText), j); } ts.end(); } PhraseQuery q = builder.build(); ScoreDoc[] hits = searcher.search(q, 1000).scoreDocs; int[] ranks = new int[] { 0 }; compareRanks(hits, ranks); } /* * How to construct a boolean query with shingles. A query like this will * implicitly score those documents higher that contain the words in the query * in the right order and adjacent to each other. */ public void testShingleAnalyzerWrapperBooleanQuery() throws Exception { BooleanQuery.Builder q = new BooleanQuery.Builder(); try (TokenStream ts = analyzer.tokenStream("content", "test sentence")) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { String termText = termAtt.toString(); q.add(new TermQuery(new Term("content", termText)), BooleanClause.Occur.SHOULD); } ts.end(); } ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs; int[] ranks = new int[] { 1, 2, 0 }; compareRanks(hits, ranks); } public void testReusableTokenStream() throws Exception { Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 2); assertAnalyzesTo(a, "please divide into shingles", new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); assertAnalyzesTo(a, "divide me up again", new String[] { "divide", "divide me", "me", "me up", "up", "up again", "again" }, new int[] { 0, 0, 7, 7, 10, 10, 13 }, new int[] { 6, 9, 9, 12, 12, 18, 18 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); a.close(); } public void testNonDefaultMinShingleSize() throws Exception { ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4); assertAnalyzesTo(analyzer, "please divide this sentence into shingles", new String[] { "please", "please divide this", "please divide this sentence", "divide", "divide this sentence", "divide this sentence into", "this", "this sentence into", "this sentence into shingles", "sentence", "sentence into shingles", "into", "shingles" }, new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 }, new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 }); analyzer.close(); analyzer = new ShingleAnalyzerWrapper( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN); assertAnalyzesTo(analyzer, "please divide this sentence into shingles", new String[] { "please divide this", "please divide this sentence", "divide this sentence", "divide this sentence into", "this sentence into", "this sentence into shingles", "sentence into shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 18, 27, 27, 32, 32, 41, 41 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); analyzer.close(); } public void testNonDefaultMinAndSameMaxShingleSize() throws Exception { ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3); assertAnalyzesTo(analyzer, "please divide this sentence into shingles", new String[] { "please", "please divide this", "divide", "divide this sentence", "this", "this sentence into", "sentence", "sentence into shingles", "into", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 }, new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 }, new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 }); analyzer.close(); analyzer = new ShingleAnalyzerWrapper( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN); assertAnalyzesTo(analyzer, "please divide this sentence into shingles", new String[] { "please divide this", "divide this sentence", "this sentence into", "sentence into shingles" }, new int[] { 0, 7, 14, 19 }, new int[] { 18, 27, 32, 41 }, new int[] { 1, 1, 1, 1 }); analyzer.close(); } public void testNoTokenSeparator() throws Exception { ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "", true, false, ShingleFilter.DEFAULT_FILLER_TOKEN); assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please", "pleasedivide", "divide", "divideinto", "into", "intoshingles", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); analyzer.close(); analyzer = new ShingleAnalyzerWrapper( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "", false, false, ShingleFilter.DEFAULT_FILLER_TOKEN); assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "pleasedivide", "divideinto", "intoshingles" }, new int[] { 0, 7, 14 }, new int[] { 13, 18, 27 }, new int[] { 1, 1, 1 }); analyzer.close(); } public void testNullTokenSeparator() throws Exception { ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, null, true, false, ShingleFilter.DEFAULT_FILLER_TOKEN); assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please", "pleasedivide", "divide", "divideinto", "into", "intoshingles", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); analyzer.close(); analyzer = new ShingleAnalyzerWrapper( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "", false, false, ShingleFilter.DEFAULT_FILLER_TOKEN); assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "pleasedivide", "divideinto", "intoshingles" }, new int[] { 0, 7, 14 }, new int[] { 13, 18, 27 }, new int[] { 1, 1, 1 }); analyzer.close(); } public void testAltTokenSeparator() throws Exception { ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "<SEP>", true, false, ShingleFilter.DEFAULT_FILLER_TOKEN); assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please", "please<SEP>divide", "divide", "divide<SEP>into", "into", "into<SEP>shingles", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); analyzer.close(); analyzer = new ShingleAnalyzerWrapper( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "<SEP>", false, false, ShingleFilter.DEFAULT_FILLER_TOKEN); assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please<SEP>divide", "divide<SEP>into", "into<SEP>shingles" }, new int[] { 0, 7, 14 }, new int[] { 13, 18, 27 }, new int[] { 1, 1, 1 }); analyzer.close(); } public void testAltFillerToken() throws Exception { Analyzer delegate = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { CharArraySet stopSet = StopFilter.makeStopSet("into"); Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); TokenFilter filter = new StopFilter(tokenizer, stopSet); return new TokenStreamComponents(tokenizer, filter); } }; ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, false, "--"); assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" }, new int[] { 0, 0, 7, 7, 19, 19 }, new int[] { 6, 13, 13, 19, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 1 }); analyzer.close(); delegate = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { CharArraySet stopSet = StopFilter.makeStopSet("into"); Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); TokenFilter filter = new StopFilter(tokenizer, stopSet); return new TokenStreamComponents(tokenizer, filter); } }; analyzer = new ShingleAnalyzerWrapper( delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, null); assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 }); analyzer.close(); delegate = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { CharArraySet stopSet = StopFilter.makeStopSet("into"); Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); TokenFilter filter = new StopFilter(tokenizer, stopSet); return new TokenStreamComponents(tokenizer, filter); } }; analyzer = new ShingleAnalyzerWrapper( delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ""); assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 }); analyzer.close(); } public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception { ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "", false, true, ShingleFilter.DEFAULT_FILLER_TOKEN); assertAnalyzesTo(analyzer, "please", new String[] { "please" }, new int[] { 0 }, new int[] { 6 }, new int[] { 1 }); analyzer.close(); } }