package org.apache.lucene.search.vectorhighlight;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Collection;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

public abstract class AbstractTestCase extends LuceneTestCase {

  protected final String F = "f";
  protected final String F1 = "f1";
  protected final String F2 = "f2";

  protected Directory dir;
  protected Analyzer analyzerW;
  protected Analyzer analyzerB;
  protected Analyzer analyzerK;
  protected IndexReader reader;
  protected QueryParser paW;
  protected QueryParser paB;

  protected static final String[] shortMVValues = {
    "",
    "",
    "a b c",
    "",      // empty data in multi valued field
    "d e"
  };
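  // Note added for orientation (the figures come from the offset tables in
  // makeIndexShortMV below): when these values are indexed into the single
  // multi-valued field F, character offsets and token positions continue
  // across value boundaries; the ruler there shows "a b c" at offsets 0-5
  // and "d e" starting at offset 6, with token positions 0 1 2 and then 3 4.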
  protected static final String[] longMVValues = {
    "Followings are the examples of customizable parameters and actual examples of customization:",
    "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
  };

  // test data for LUCENE-1448 bug
  protected static final String[] biMVValues = {
    "\nLucene/Solr does not require such additional hardware.",
    "\nWhen you talk about processing speed, the"
  };

  protected static final String[] strMVValues = {
    "abc",
    "defg",
    "hijkl"
  };

  @Override
  protected void setUp() throws Exception {
    super.setUp();
    analyzerW = new MockAnalyzer(MockTokenizer.WHITESPACE, false);
    analyzerB = new BigramAnalyzer();
    analyzerK = new MockAnalyzer(MockTokenizer.KEYWORD, false);
    paW = new QueryParser( TEST_VERSION_CURRENT, F, analyzerW );
    paB = new QueryParser( TEST_VERSION_CURRENT, F, analyzerB );
    dir = newDirectory(newRandom());
  }

  @Override
  protected void tearDown() throws Exception {
    if( reader != null ){
      reader.close();
      reader = null;
    }
    dir.close();
    super.tearDown();
  }

  protected Query tq( String text ){
    return tq( 1F, text );
  }

  protected Query tq( float boost, String text ){
    return tq( boost, F, text );
  }

  protected Query tq( String field, String text ){
    return tq( 1F, field, text );
  }

  protected Query tq( float boost, String field, String text ){
    Query query = new TermQuery( new Term( field, text ) );
    query.setBoost( boost );
    return query;
  }

  protected Query pqF( String... texts ){
    return pqF( 1F, texts );
  }

  protected Query pqF( float boost, String... texts ){
    return pqF( boost, 0, texts );
  }

  protected Query pqF( float boost, int slop, String... texts ){
    return pq( boost, slop, F, texts );
  }

  protected Query pq( String field, String... texts ){
    return pq( 1F, 0, field, texts );
  }

  protected Query pq( float boost, String field, String... texts ){
    return pq( boost, 0, field, texts );
  }

  protected Query pq( float boost, int slop, String field, String... texts ){
    PhraseQuery query = new PhraseQuery();
    for( String text : texts ){
      query.add( new Term( field, text ) );
    }
    query.setBoost( boost );
    query.setSlop( slop );
    return query;
  }

  protected Query dmq( Query... queries ){
    return dmq( 0.0F, queries );
  }

  protected Query dmq( float tieBreakerMultiplier, Query... queries ){
    DisjunctionMaxQuery query = new DisjunctionMaxQuery( tieBreakerMultiplier );
    for( Query q : queries ){
      query.add( q );
    }
    return query;
  }
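  // Illustrative usage, not part of the original fixture: subclasses combine
  // the shorthand builders above when asserting highlighter behavior, e.g.
  //
  //   Query q = dmq( 0.5f, tq( "hello" ), pqF( "search", "engines" ) );
  //
  // builds a DisjunctionMaxQuery (tie-breaker 0.5) over a TermQuery f:hello
  // and a PhraseQuery "search engines", both on the default test field F.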
  protected void assertCollectionQueries( Collection<Query> actual, Query... expected ){
    assertEquals( expected.length, actual.size() );
    for( Query query : expected ){
      assertTrue( actual.contains( query ) );
    }
  }

  static final class BigramAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new BasicNGramTokenizer( reader );
    }
  }

  static final class BasicNGramTokenizer extends Tokenizer {

    public static final int DEFAULT_N_SIZE = 2;
    public static final String DEFAULT_DELIMITERS = " \t\n.,";
    private final int n;
    private final String delimiters;
    private int startTerm;
    private int lenTerm;
    private int startOffset;
    private int nextStartOffset;
    private int ch;
    private String snippet;
    private StringBuilder snippetBuffer;
    private static final int BUFFER_SIZE = 4096;
    private char[] charBuffer;
    private int charBufferIndex;
    private int charBufferLen;

    public BasicNGramTokenizer( Reader in ){
      this( in, DEFAULT_N_SIZE );
    }

    public BasicNGramTokenizer( Reader in, int n ){
      this( in, n, DEFAULT_DELIMITERS );
    }

    public BasicNGramTokenizer( Reader in, String delimiters ){
      this( in, DEFAULT_N_SIZE, delimiters );
    }

    public BasicNGramTokenizer( Reader in, int n, String delimiters ){
      super(in);
      this.n = n;
      this.delimiters = delimiters;
      startTerm = 0;
      nextStartOffset = 0;
      snippet = null;
      snippetBuffer = new StringBuilder();
      charBuffer = new char[BUFFER_SIZE];
      charBufferIndex = BUFFER_SIZE;
      charBufferLen = 0;
      ch = 0;
    }

    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    @Override
    public boolean incrementToken() throws IOException {
      if( !getNextPartialSnippet() )
        return false;
      clearAttributes();
      termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
      offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
      return true;
    }

    private int getFinalOffset() {
      return nextStartOffset;
    }

    @Override
    public final void end(){
      offsetAtt.setOffset(getFinalOffset(), getFinalOffset());
    }
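    // Worked example, added here; it can be checked against the offset tables
    // in makeIndexLongMVB below: with n = 2 and DEFAULT_DELIMITERS, the input
    // "Lucene/Solr" contains no delimiter, so it becomes a single snippet and
    // incrementToken() emits the sliding bigrams Lu, uc, ce, en, ne, e/, /S,
    // So, ol, lr, each with start/end offsets spanning its two characters.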
    protected boolean getNextPartialSnippet() throws IOException {
      if( snippet != null && snippet.length() >= startTerm + 1 + n ){
        startTerm++;
        startOffset++;
        lenTerm = n;
        return true;
      }
      return getNextSnippet();
    }

    protected boolean getNextSnippet() throws IOException {
      startTerm = 0;
      startOffset = nextStartOffset;
      snippetBuffer.delete( 0, snippetBuffer.length() );
      while( true ){
        if( ch != -1 )
          ch = readCharFromBuffer();
        if( ch == -1 ) break;
        else if( !isDelimiter( ch ) )
          snippetBuffer.append( (char)ch );
        else if( snippetBuffer.length() > 0 )
          break;
        else
          startOffset++;
      }
      if( snippetBuffer.length() == 0 ) return false;
      snippet = snippetBuffer.toString();
      lenTerm = snippet.length() >= n ? n : snippet.length();
      return true;
    }

    protected int readCharFromBuffer() throws IOException {
      if( charBufferIndex >= charBufferLen ){
        charBufferLen = input.read( charBuffer );
        if( charBufferLen == -1 ){
          return -1;
        }
        charBufferIndex = 0;
      }
      int c = charBuffer[charBufferIndex++];
      nextStartOffset++;
      return c;
    }

    protected boolean isDelimiter( int c ){
      return delimiters.indexOf( c ) >= 0;
    }

    @Override
    public void reset( Reader input ) throws IOException {
      super.reset( input );
      reset();
    }

    @Override
    public void reset() throws IOException {
      startTerm = 0;
      nextStartOffset = 0;
      snippet = null;
      snippetBuffer.setLength( 0 );
      charBufferIndex = BUFFER_SIZE;
      charBufferLen = 0;
      ch = 0;
    }
  }

  protected void make1d1fIndex( String value ) throws Exception {
    make1dmfIndex( value );
  }

  protected void make1d1fIndexB( String value ) throws Exception {
    make1dmfIndexB( value );
  }

  protected void make1dmfIndex( String... values ) throws Exception {
    make1dmfIndex( analyzerW, values );
  }

  protected void make1dmfIndexB( String... values ) throws Exception {
    make1dmfIndex( analyzerB, values );
  }

  // make 1 doc with multi valued field
  protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception {
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
        TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE));
    Document doc = new Document();
    for( String value: values )
      doc.add( new Field( F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
    writer.addDocument( doc );
    writer.close();
    if (reader != null) reader.close();
    reader = IndexReader.open( dir, true );
  }

  // make 1 doc with multi valued & not analyzed field
  protected void make1dmfIndexNA( String... values ) throws Exception {
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
        TEST_VERSION_CURRENT, analyzerK).setOpenMode(OpenMode.CREATE));
    Document doc = new Document();
    for( String value: values )
      doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
    writer.addDocument( doc );
    writer.close();
    if (reader != null) reader.close();
    reader = IndexReader.open( dir, true );
  }
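  // Illustrative usage, not part of the original fixture: a typical test
  // builds its index and a query against it like
  //
  //   make1d1fIndex( "a b c" );   // one document, field F, whitespace analyzer
  //   Query q = tq( "b" );        // TermQuery f:b to run against that index
  //
  // after which `reader` is open on the freshly (re)created single-doc index.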
  protected void makeIndexShortMV() throws Exception {
    // ""
    // ""
    //  012345
    // "a b c"
    //  0 1 2
    // ""
    //  6789
    // "d e"
    //  3 4
    make1dmfIndex( shortMVValues );
  }

  protected void makeIndexLongMV() throws Exception {
    //           11111111112222222222333333333344444444445555555555666666666677777777778888888888999
    // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
    // Followings are the examples of customizable parameters and actual examples of customization:
    // 0          1   2   3        4  5            6          7   8      9        10 11
    //
    //        1                                                                                                   2
    // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
    // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
    // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
    // 12  13   (14)   (15)    16  17   18  19 20    21       22   23  (24)   (25)    26   27   28   29  30  31  32   33      34
    make1dmfIndex( longMVValues );
  }

  protected void makeIndexLongMVB() throws Exception {
    // "*" ... LF
    //
    //           1111111111222222222233333333334444444444555555
    // 01234567890123456789012345678901234567890123456789012345
    // *Lucene/Solr does not require such additional hardware.
    //  Lu 0    do 10   re 15   su 21   na 31
    //  uc 1    oe 11   eq 16   uc 22   al 32
    //  ce 2    es 12   qu 17   ch 23   ha 33
    //  en 3    no 13   ui 18   ad 24   ar 34
    //  ne 4    ot 14   ir 19   dd 25   rd 35
    //  e/ 5             re 20  di 26   dw 36
    //  /S 6                    it 27   wa 37
    //  So 7                    ti 28   ar 38
    //  ol 8                    io 29   re 39
    //  lr 9                    on 30
    //
    // 5555666666666677777777778888888888999999999
    // 6789012345678901234567890123456789012345678
    // *When you talk about processing speed, the
    //  Wh 40   ab 48   es 56   th 65
    //  he 41   bo 49   ss 57   he 66
    //  en 42   ou 50   si 58
    //  yo 43   ut 51   in 59
    //  ou 44   pr 52   ng 60
    //  ta 45   ro 53   sp 61
    //  al 46   oc 54   pe 62
    //  lk 47   ce 55   ee 63
    //                  ed 64
    make1dmfIndexB( biMVValues );
  }

  protected void makeIndexStrMV() throws Exception {
    //  0123
    // "abc"
    //  34567
    // "defg"
    //     111
    //  789012
    // "hijkl"
    make1dmfIndexNA( strMVValues );
  }
}