package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.*;
import org.apache.lucene.store.*;
import org.apache.lucene.search.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.junit.Ignore;

// Best to run this test w/ plenty of RAM (because of the
// terms index):
//
//   ant compile-test
//
//   java -server -Xmx8g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/test-framework:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=MMapDirectory -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
//

/**
 * Stress test that builds an index containing more than {@link Integer#MAX_VALUE}
 * unique terms, then verifies that a random sample of those terms can still be
 * found by search and by term enumeration, and that CheckIndex reports a term
 * count above 2^31-1.  Disabled by default (see {@code @Ignore}) because a full
 * run takes hours.
 */
public class Test2BTerms extends LuceneTestCase {

  /**
   * TokenStream that emits {@code tokensPerDoc} random fixed-length (5 char)
   * tokens per document.  It writes each token directly into the term
   * attribute's char buffer, so no per-token allocation occurs.  Roughly every
   * 500K-1M tokens it records the emitted term in {@link #savedTerms} so the
   * test can later verify those terms are searchable.
   */
  private final class MyTokenStream extends TokenStream {

    private final int tokensPerDoc;
    // number of tokens emitted since the last reset()
    private int tokenCount;
    private final CharTermAttribute charTerm;
    private final static int TOKEN_LEN = 5;
    // direct reference to charTerm's internal buffer; incrementToken()
    // overwrites it in place for each token
    private final char[] chars;
    // sample of emitted terms, later verified by test2BTerms()
    public final List<String> savedTerms = new ArrayList<String>();
    // countdown until the next term is sampled into savedTerms
    private int nextSave;

    public MyTokenStream(int tokensPerDoc) {
      super();
      this.tokensPerDoc = tokensPerDoc;
      charTerm = addAttribute(CharTermAttribute.class);
      // resizeBuffer returns the (possibly re-allocated) internal buffer;
      // the length is fixed once since every token is exactly TOKEN_LEN chars
      chars = charTerm.resizeBuffer(TOKEN_LEN);
      charTerm.setLength(TOKEN_LEN);
      nextSave = _TestUtil.nextInt(random, 500000, 1000000);
    }

    @Override
    public boolean incrementToken() {
      if (tokenCount >= tokensPerDoc) {
        return false;
      }
      // fill the term buffer in place with a random 5-char unicode string
      _TestUtil.randomFixedLengthUnicodeString(random, chars, 0, TOKEN_LEN);
      tokenCount++;
      if (--nextSave == 0) {
        final String s = new String(chars, 0, TOKEN_LEN);
        System.out.println("TEST: save term=" + s + " [" + toHexString(s) + "]");
        savedTerms.add(s);
        // re-arm the countdown for the next sample
        nextSave = _TestUtil.nextInt(random, 500000, 1000000);
      }
      return true;
    }

    @Override
    public void reset() {
      // allows the same stream instance to be reused across documents;
      // note nextSave deliberately carries over so sampling stays uniform
      tokenCount = 0;
    }
  }

  /**
   * Indexes ~(2^31 + 100M) total terms spread over documents of 100K-1M terms
   * each, optimizes, then verifies a sample of saved terms is searchable and
   * that the final term count exceeds Integer.MAX_VALUE.
   *
   * @throws IOException on any index I/O failure
   */
  @Ignore("Takes ~4 hours to run on a fast machine!!")
  public void test2BTerms() throws IOException {

    final long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;

    final int TERMS_PER_DOC = _TestUtil.nextInt(random, 100000, 1000000);

    List<String> savedTerms = null;

    MockDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BTerms"));
    dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER);
    dir.setCheckIndexOnClose(false); // don't double-checkindex
    //Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex"));

    // NOTE(review): this guard looks like a manual toggle — flip it to false
    // (and use the commented-out dir above) to re-test an already-built index;
    // findTerms() below then re-collects the sample terms by enumeration.
    if (true) {

      IndexWriter w = new IndexWriter(dir,
                                      new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
                                      .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                                      .setRAMBufferSizeMB(256.0)
                                      .setMergeScheduler(new ConcurrentMergeScheduler())
                                      .setMergePolicy(newLogMergePolicy(false, 10))
                                      .setOpenMode(IndexWriterConfig.OpenMode.CREATE));

      MergePolicy mp = w.getConfig().getMergePolicy();
      if (mp instanceof LogByteSizeMergePolicy) {
        // 1 petabyte: effectively remove the max-merge-size cap
        ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
      }

      Document doc = new Document();
      // the same doc/field/stream instance is reused for every addDocument;
      // reset() re-arms the stream for the next document's tokens
      final MyTokenStream ts = new MyTokenStream(TERMS_PER_DOC);
      Field field = new Field("field", ts);
      field.setIndexOptions(IndexOptions.DOCS_ONLY);
      field.setOmitNorms(true);
      doc.add(field);
      //w.setInfoStream(System.out);
      final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);

      System.out.println("TERMS_PER_DOC=" + TERMS_PER_DOC);
      System.out.println("numDocs=" + numDocs);

      for(int i=0;i<numDocs;i++) {
        final long t0 = System.currentTimeMillis();
        w.addDocument(doc);
        System.out.println(i + " of " + numDocs + " " + (System.currentTimeMillis()-t0) + " msec");
      }
      savedTerms = ts.savedTerms;

      System.out.println("TEST: optimize");
      w.optimize();

      System.out.println("TEST: close writer");
      w.close();
    }

    System.out.println("TEST: open reader");
    final IndexReader r = IndexReader.open(dir);
    if (savedTerms == null) {
      // index came from a prior run; rebuild the sample by scanning all terms
      savedTerms = findTerms(r);
    }
    final int numSavedTerms = savedTerms.size();
    // the last 10 saved terms have the highest term ordinals — exercise them
    // separately since they sit past the 2^31 boundary in the terms dict
    final List<String> bigOrdTerms = new ArrayList<String>(savedTerms.subList(numSavedTerms-10, numSavedTerms));
    System.out.println("TEST: test big ord terms...");
    testSavedTerms(r, bigOrdTerms);
    System.out.println("TEST: test all saved terms...");
    testSavedTerms(r, savedTerms);
    r.close();

    System.out.println("TEST: now CheckIndex...");
    CheckIndex.Status status = _TestUtil.checkIndex(dir);
    final long tc = status.segmentInfos.get(0).termIndexStatus.termCount;
    // the whole point of the test: the single optimized segment must hold
    // more than Integer.MAX_VALUE unique terms
    assertTrue("count " + tc + " is not > " + Integer.MAX_VALUE, tc > Integer.MAX_VALUE);
    dir.close();
  }

  /**
   * Scans every term in the index, sampling roughly one term per 500K-1M seen.
   * Used when the index was built by a previous run and no in-memory sample
   * exists.
   *
   * @param r reader over the full index
   * @return sampled terms, in enumeration order
   * @throws IOException on any index I/O failure
   */
  private List<String> findTerms(IndexReader r) throws IOException {
    System.out.println("TEST: findTerms");
    final TermEnum termEnum = r.terms();
    final List<String> savedTerms = new ArrayList<String>();
    int nextSave = _TestUtil.nextInt(random, 500000, 1000000);
    while(termEnum.next()) {
      if (--nextSave == 0) {
        savedTerms.add(termEnum.term().text());
        System.out.println("TEST: add " + termEnum.term());
        nextSave = _TestUtil.nextInt(random, 500000, 1000000);
      }
    }
    return savedTerms;
  }

  /**
   * Renders a string's UTF-8 bytes as space-separated hex, for unambiguous
   * logging of random unicode terms.
   *
   * @param s string to render
   * @return hex dump of s's UTF-8 encoding, e.g. "61 62 63"
   */
  private String toHexString(String s) {
    byte[] bytes;
    try {
      bytes = s.getBytes("UTF-8");
    } catch (UnsupportedEncodingException uee) {
      // UTF-8 is guaranteed by the JLS to be supported; this cannot happen
      throw new RuntimeException(uee);
    }
    StringBuilder sb = new StringBuilder();
    for(byte b : bytes) {
      if (sb.length() > 0) {
        sb.append(' ');
      }
      sb.append(Integer.toHexString(b&0xFF));
    }
    return sb.toString();
  }

  /**
   * Randomly picks terms from the sample (10x the sample size iterations) and
   * verifies each: a TermQuery must hit at least one doc, and seeking the term
   * enum to the term must land exactly on it.  Failures are accumulated and
   * reported at the end rather than aborting on the first miss.
   *
   * @param r     reader over the index under test
   * @param terms sampled terms that must all be present
   * @throws IOException on any index I/O failure
   */
  private void testSavedTerms(IndexReader r, List<String> terms) throws IOException {
    System.out.println("TEST: run " + terms.size() + " terms on reader=" + r);
    IndexSearcher s = new IndexSearcher(r);
    Collections.shuffle(terms);
    boolean failed = false;
    for(int iter=0;iter<10*terms.size();iter++) {
      final String term = terms.get(random.nextInt(terms.size()));
      System.out.println("TEST: search " + term + " [" + toHexString(term) + "]");
      final long t0 = System.currentTimeMillis();
      final int count = s.search(new TermQuery(new Term("field", term)), 1).totalHits;
      if (count <= 0) {
        System.out.println("  FAILED: count=" + count);
        failed = true;
      }
      final long t1 = System.currentTimeMillis();
      System.out.println("  took " + (t1-t0) + " millis");

      // seek the term enum directly to the term; it must be the exact term,
      // not merely the next one in order
      final TermEnum termEnum = r.terms(new Term("field", term));
      final String text = termEnum.term().text();
      if (!term.equals(text)) {
        System.out.println("  FAILED: wrong term: got " + text + " [" + toHexString(text) + "]");
        failed = true;
      }
    }
    assertFalse(failed);
  }
}