package org.apache.lucene.index.memory; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.HitCollector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Searcher; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.index.TermDocs; /** Verifies that Lucene MemoryIndex and RAMDirectory have the same behaviour, returning the same results for any given query. Runs a set of queries against a set of files and compares results for identity. Can also be used as a simple benchmark. <p> Example usage: <pre> cd lucene-svn java -server -cp ~/unix/java/share/misc/junit/junit.jar:build/classes:build/lucene-core-2.1-dev.jar:build/contrib/memory/classes/test:build/contrib/memory/classes/java org.apache.lucene.index.memory.MemoryIndexTest 1 1 memram @contrib/memory/src/test/org/apache/lucene/index/memory/testqueries.txt *.txt *.html *.xml xdocs/*.xml src/test/org/apache/lucene/queryParser/*.java contrib/memory/src/java/org/apache/lucene/index/memory/*.java </pre> where testqueries.txt is a file with one query per line, such as: <pre> # # queries extracted from TestQueryParser.java # Apache Apach~ AND Copy* a AND b (a AND b) c OR (a AND b) a AND NOT b a AND -b a AND !b a && b a && ! b a OR b a || b a OR !b a OR ! b a OR -b +term -term term foo:term AND field:anotherTerm term AND "phrase phrase" "hello there" germ term^2.0 (term)^2.0 (germ term)^2.0 term^2.0 term^2 "germ term"^2.0 "term germ"^2 (foo OR bar) AND (baz OR boo) ((a OR b) AND NOT c) OR d +(apple "steve jobs") -(foo bar baz) +title:(dog OR cat) -author:"bob dole" a&b a&&b .NET "term germ"~2 "term germ"~2 flork "term"~2 "~2 germ" "term germ"~2^2 3 term 1.0 1 2 term term1 term2 term* term*^2 term~ term~0.7 term~^2 term^2~ term*germ term*germ^3 term* Term* TERM* term* Term* TERM* // Then 'full' wildcard queries: te?m Te?m TE?M Te?m*gerM te?m Te?m TE?M Te?m*gerM term term term term +stop term term -stop term drop AND stop AND roll term phrase term term AND NOT phrase term stop [ a TO c] [ a TO c ] { a TO c} { a TO c } { a TO c }^2.0 [ a TO c] OR bar [ a TO c] AND bar ( bar blar { a TO c}) gack ( bar blar { a TO c}) +weltbank +worlbank +weltbank\n+worlbank weltbank \n+worlbank weltbank \n +worlbank +weltbank\r+worlbank weltbank \r+worlbank weltbank \r +worlbank +weltbank\r\n+worlbank weltbank \r\n+worlbank weltbank \r\n +worlbank weltbank \r \n +worlbank +weltbank\t+worlbank weltbank \t+worlbank weltbank \t +worlbank term term term term +term term term term +term term +term +term -term term term on^1.0 "hello"^2.0 hello^2.0 "on"^1.0 the^3 </pre> */ public class MemoryIndexTest extends BaseTokenStreamTestCase { private Analyzer analyzer; private boolean fastMode = false; private final boolean verbose = false; private static final String FIELD_NAME = "content"; /** Runs the tests and/or benchmark */ public static void main(String[] args) throws Throwable { new MemoryIndexTest().run(args); } /* all files will be open relative to this */ public String fileDir; protected void setUp() throws Exception { super.setUp(); fileDir = System.getProperty("lucene.common.dir", null); } // public void tearDown() {} public void testMany() throws Throwable { String[] files = listFiles(new String[] { "*.txt", "*.html", "*.xml", "xdocs/*.xml", "src/java/test/org/apache/lucene/queryParser/*.java", "contrib/memory/src/java/org/apache/lucene/index/memory/*.java", }); System.out.println("files = " + java.util.Arrays.asList(files)); String[] xargs = new String[] { "1", "1", "memram", "@contrib/memory/src/test/org/apache/lucene/index/memory/testqueries.txt", }; String[] args = new String[xargs.length + files.length]; System.arraycopy(xargs, 0, args, 0, xargs.length); System.arraycopy(files, 0, args, xargs.length, files.length); run(args); } private void run(String[] args) throws Throwable { int k = -1; int iters = 1; if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k])); int runs = 1; if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k])); String cmd = "memram"; if (args.length > ++k) cmd = args[k]; boolean useMemIndex = cmd.indexOf("mem") >= 0; boolean useRAMIndex = cmd.indexOf("ram") >= 0; String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" }; if (args.length > ++k) { String arg = args[k]; if (arg.startsWith("@")) queries = readLines(new File(fileDir, arg.substring(1))); else queries = new String[] { arg }; } File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") }; if (args.length > ++k) { files = new File[args.length - k]; for (int i=k; i < args.length; i++) { files[i-k] = new File(args[i]); } } boolean toLowerCase = true; // boolean toLowerCase = false; // Set stopWords = null; Set stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET; Analyzer[] analyzers = new Analyzer[] { new SimpleAnalyzer(), new StopAnalyzer(), new StandardAnalyzer(), PatternAnalyzer.DEFAULT_ANALYZER, // new WhitespaceAnalyzer(), // new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null), // new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords), // new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS), }; boolean first = true; for (int iter=0; iter < iters; iter++) { System.out.println("\n########### iteration=" + iter); long start = System.currentTimeMillis(); long bytes = 0; for (int anal=0; anal < analyzers.length; anal++) { this.analyzer = analyzers[anal]; for (int i=0; i < files.length; i++) { File file = files[i]; if (!file.exists() || file.isDirectory()) continue; // ignore bytes += file.length(); String text = toString(new FileInputStream(file), null); Document doc = createDocument(text); if (verbose) System.out.println("\n*********** FILE=" + file); for (int q=0; q < queries.length; q++) { try { Query query = parseQuery(queries[q]); boolean measureIndexing = false; // toggle this to measure query performance MemoryIndex memind = null; if (useMemIndex && !measureIndexing) memind = createMemoryIndex(doc); if (first) { IndexSearcher s = memind.createSearcher(); TermDocs td = s.getIndexReader().termDocs(null); assertTrue(td.next()); assertEquals(0, td.doc()); assertEquals(1, td.freq()); td.close(); s.close(); first = false; } RAMDirectory ramind = null; if (useRAMIndex && !measureIndexing) ramind = createRAMIndex(doc); for (int run=0; run < runs; run++) { float score1 = 0.0f; float score2 = 0.0f; if (useMemIndex && measureIndexing) memind = createMemoryIndex(doc); if (useMemIndex) score1 = query(memind, query); if (useRAMIndex && measureIndexing) ramind = createRAMIndex(doc); if (useRAMIndex) score2 = query(ramind, query); if (useMemIndex && useRAMIndex) { if (verbose) System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2); if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) { throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer); } } } } catch (Throwable t) { if (t instanceof OutOfMemoryError) t.printStackTrace(); System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer); throw t; } } } } long end = System.currentTimeMillis(); System.out.println("\nsecs = " + ((end-start)/1000.0f)); System.out.println("queries/sec= " + (1.0f * runs * queries.length * analyzers.length * files.length / ((end-start)/1000.0f))); float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f); System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f))); } if (useMemIndex && useRAMIndex) System.out.println("No bug found. done."); else System.out.println("Done benchmarking (without checking correctness)."); } // returns file line by line, ignoring empty lines and comments private String[] readLines(File file) throws Exception { BufferedReader reader = new BufferedReader(new InputStreamReader( new FileInputStream(file))); List lines = new ArrayList(); String line; while ((line = reader.readLine()) != null) { String t = line.trim(); if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) { lines.add(line); } } reader.close(); String[] result = new String[lines.size()]; lines.toArray(result); return result; } private Document createDocument(String content) { Document doc = new Document(); doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS)); return doc; } private MemoryIndex createMemoryIndex(Document doc) { MemoryIndex index = new MemoryIndex(); Iterator iter = doc.getFields().iterator(); while (iter.hasNext()) { Field field = (Field) iter.next(); index.addField(field.name(), field.stringValue(), analyzer); } return index; } private RAMDirectory createRAMIndex(Document doc) { RAMDirectory dir = new RAMDirectory(); IndexWriter writer = null; try { writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); writer.addDocument(doc); writer.optimize(); return dir; } catch (IOException e) { // should never happen (RAMDirectory) throw new RuntimeException(e); } finally { try { if (writer != null) writer.close(); } catch (IOException e) { // should never happen (RAMDirectory) throw new RuntimeException(e); } } } private float query(Object index, Query query) { // System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f))); Searcher searcher = null; try { if (index instanceof Directory) searcher = new IndexSearcher((Directory)index); else searcher = ((MemoryIndex) index).createSearcher(); final float[] scores = new float[1]; // inits to 0.0f searcher.search(query, new HitCollector() { public void collect(int doc, float score) { scores[0] = score; } }); float score = scores[0]; // Hits hits = searcher.search(query); // float score = hits.length() > 0 ? hits.score(0) : 0.0f; return score; } catch (IOException e) { // should never happen (RAMDirectory) throw new RuntimeException(e); } finally { try { if (searcher != null) searcher.close(); } catch (IOException e) { // should never happen (RAMDirectory) throw new RuntimeException(e); } } } private int getMemorySize(Object index) { if (index instanceof Directory) { try { Directory dir = (Directory) index; int size = 0; String[] fileNames = dir.list(); for (int i=0; i < fileNames.length; i++) { size += dir.fileLength(fileNames[i]); } return size; } catch (IOException e) { // can never happen (RAMDirectory) throw new RuntimeException(e); } } else { return ((MemoryIndex) index).getMemorySize(); } } private Query parseQuery(String expression) throws ParseException { QueryParser parser = new QueryParser(FIELD_NAME, analyzer); // parser.setPhraseSlop(0); return parser.parse(expression); } /** returns all files matching the given file name patterns (quick n'dirty) */ static String[] listFiles(String[] fileNames) { LinkedHashSet allFiles = new LinkedHashSet(); for (int i=0; i < fileNames.length; i++) { int k; if ((k = fileNames[i].indexOf("*")) < 0) { allFiles.add(fileNames[i]); } else { String prefix = fileNames[i].substring(0, k); if (prefix.length() == 0) prefix = "."; final String suffix = fileNames[i].substring(k+1); File[] files = new File(prefix).listFiles(new FilenameFilter() { public boolean accept(File dir, String name) { return name.endsWith(suffix); } }); if (files != null) { for (int j=0; j < files.length; j++) { allFiles.add(files[j].getPath()); } } } } String[] result = new String[allFiles.size()]; allFiles.toArray(result); return result; } // trick to detect default platform charset private static final Charset DEFAULT_PLATFORM_CHARSET = Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding()); // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux private static String toString(InputStream input, Charset charset) throws IOException { if (charset == null) charset = DEFAULT_PLATFORM_CHARSET; byte[] data = toByteArray(input); return charset.decode(ByteBuffer.wrap(data)).toString(); } private static byte[] toByteArray(InputStream input) throws IOException { try { // safe and fast even if input.available() behaves weird or buggy int len = Math.max(256, input.available()); byte[] buffer = new byte[len]; byte[] output = new byte[len]; len = 0; int n; while ((n = input.read(buffer)) >= 0) { if (len + n > output.length) { // grow capacity byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; System.arraycopy(output, 0, tmp, 0, len); System.arraycopy(buffer, 0, tmp, len, n); buffer = output; // use larger buffer for future larger bulk reads output = tmp; } else { System.arraycopy(buffer, 0, output, len, n); } len += n; } if (len == output.length) return output; buffer = null; // help gc buffer = new byte[len]; System.arraycopy(output, 0, buffer, 0, len); return buffer; } finally { if (input != null) input.close(); } } }