package org.apache.lucene; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.*; import org.apache.lucene.index.*; import org.apache.lucene.document.*; import org.apache.lucene.search.*; import org.apache.lucene.analysis.*; import org.apache.lucene.index.codecs.*; import org.apache.lucene.index.codecs.standard.*; import org.apache.lucene.index.codecs.pulsing.*; import org.apache.lucene.store.*; import java.util.*; import java.io.*; /* Intentionally outside of oal.index to verify fully external codecs work fine */ public class TestExternalCodecs extends LuceneTestCase { // For fun, test that we can override how terms are // sorted, and basic things still work -- this comparator // sorts in reversed unicode code point order: private static final Comparator<BytesRef> reverseUnicodeComparator = new Comparator<BytesRef>() { public int compare(BytesRef t1, BytesRef t2) { byte[] b1 = t1.bytes; byte[] b2 = t2.bytes; int b1Stop; int b1Upto = t1.offset; int b2Upto = t2.offset; if (t1.length < t2.length) { b1Stop = t1.offset + t1.length; } else { b1Stop = t1.offset + t2.length; } while(b1Upto < b1Stop) { final int bb1 = b1[b1Upto++] & 0xff; final int bb2 = b2[b2Upto++] & 0xff; if (bb1 != bb2) { //System.out.println("cmp 1=" + t1 + " 2=" + t2 + " return " + (bb2-bb1)); return bb2 - bb1; } } // One is prefix of another, or they are equal return t2.length-t1.length; } public boolean equals(Object other) { return this == other; } }; // TODO // - good improvement would be to write through to disk, // and then load into ram from disk public static class RAMOnlyCodec extends Codec { // Postings state: static class RAMPostings extends FieldsProducer { final Map<String,RAMField> fieldToTerms = new TreeMap<String,RAMField>(); @Override public Terms terms(String field) { return fieldToTerms.get(field); } @Override public FieldsEnum iterator() { return new RAMFieldsEnum(this); } @Override public void close() { } @Override public void loadTermsIndex(int indexDivisor) { } } static class RAMField extends Terms { final String field; final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>(); RAMField(String field) { this.field = field; } @Override public long getUniqueTermCount() { return termToDocs.size(); } @Override public TermsEnum iterator() { return new RAMTermsEnum(RAMOnlyCodec.RAMField.this); } @Override public Comparator<BytesRef> getComparator() { return reverseUnicodeComparator; } } static class RAMTerm { final String term; final List<RAMDoc> docs = new ArrayList<RAMDoc>(); public RAMTerm(String term) { this.term = term; } } static class RAMDoc { final int docID; final int[] positions; public RAMDoc(int docID, int freq) { this.docID = docID; positions = new int[freq]; } } // Classes for writing to the postings state private static class RAMFieldsConsumer extends FieldsConsumer { private final RAMPostings postings; private final RAMTermsConsumer termsConsumer = new RAMTermsConsumer(); public RAMFieldsConsumer(RAMPostings postings) { this.postings = postings; } @Override public TermsConsumer addField(FieldInfo field) { RAMField ramField = new RAMField(field.name); postings.fieldToTerms.put(field.name, ramField); termsConsumer.reset(ramField); return termsConsumer; } @Override public void close() { // TODO: finalize stuff } } private static class RAMTermsConsumer extends TermsConsumer { private RAMField field; private final RAMPostingsWriterImpl postingsWriter = new RAMPostingsWriterImpl(); RAMTerm current; void reset(RAMField field) { this.field = field; } @Override public PostingsConsumer startTerm(BytesRef text) { final String term = text.utf8ToString(); current = new RAMTerm(term); postingsWriter.reset(current); return postingsWriter; } @Override public Comparator<BytesRef> getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override public void finishTerm(BytesRef text, int numDocs) { assert numDocs > 0; assert numDocs == current.docs.size(); field.termToDocs.put(current.term, current); } @Override public void finish() { } } public static class RAMPostingsWriterImpl extends PostingsConsumer { private RAMTerm term; private RAMDoc current; private int posUpto = 0; public void reset(RAMTerm term) { this.term = term; } @Override public void startDoc(int docID, int freq) { current = new RAMDoc(docID, freq); term.docs.add(current); posUpto = 0; } @Override public void addPosition(int position, BytesRef payload) { if (payload != null) { throw new UnsupportedOperationException("can't handle payloads"); } current.positions[posUpto++] = position; } @Override public void finishDoc() { assert posUpto == current.positions.length; } } // Classes for reading from the postings state static class RAMFieldsEnum extends FieldsEnum { private final RAMPostings postings; private final Iterator<String> it; private String current; public RAMFieldsEnum(RAMPostings postings) { this.postings = postings; this.it = postings.fieldToTerms.keySet().iterator(); } @Override public String next() { if (it.hasNext()) { current = it.next(); } else { current = null; } return current; } @Override public TermsEnum terms() { return new RAMTermsEnum(postings.fieldToTerms.get(current)); } } static class RAMTermsEnum extends TermsEnum { Iterator<String> it; String current; private final RAMField ramField; public RAMTermsEnum(RAMField field) { this.ramField = field; } @Override public Comparator<BytesRef> getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override public BytesRef next() { if (it == null) { if (current == null) { it = ramField.termToDocs.keySet().iterator(); } else { it = ramField.termToDocs.tailMap(current).keySet().iterator(); } } if (it.hasNext()) { current = it.next(); return new BytesRef(current); } else { return null; } } @Override public SeekStatus seek(BytesRef term, boolean useCache) { current = term.utf8ToString(); it = null; if (ramField.termToDocs.containsKey(current)) { return SeekStatus.FOUND; } else { if (current.compareTo(ramField.termToDocs.lastKey()) > 0) { return SeekStatus.END; } else { return SeekStatus.NOT_FOUND; } } } @Override public SeekStatus seek(long ord) { throw new UnsupportedOperationException(); } @Override public long ord() { throw new UnsupportedOperationException(); } @Override public BytesRef term() { // TODO: reuse BytesRef return new BytesRef(current); } @Override public int docFreq() { return ramField.termToDocs.get(current).docs.size(); } @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs); } @Override public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) { return new RAMDocsAndPositionsEnum(ramField.termToDocs.get(current), skipDocs); } } private static class RAMDocsEnum extends DocsEnum { private final RAMTerm ramTerm; private final Bits skipDocs; private RAMDoc current; int upto = -1; int posUpto = 0; public RAMDocsEnum(RAMTerm ramTerm, Bits skipDocs) { this.ramTerm = ramTerm; this.skipDocs = skipDocs; } @Override public int advance(int targetDocID) { do { nextDoc(); } while (upto < ramTerm.docs.size() && current.docID < targetDocID); return NO_MORE_DOCS; } // TODO: override bulk read, for better perf @Override public int nextDoc() { while(true) { upto++; if (upto < ramTerm.docs.size()) { current = ramTerm.docs.get(upto); if (skipDocs == null || !skipDocs.get(current.docID)) { posUpto = 0; return current.docID; } } else { return NO_MORE_DOCS; } } } @Override public int freq() { return current.positions.length; } @Override public int docID() { return current.docID; } } private static class RAMDocsAndPositionsEnum extends DocsAndPositionsEnum { private final RAMTerm ramTerm; private final Bits skipDocs; private RAMDoc current; int upto = -1; int posUpto = 0; public RAMDocsAndPositionsEnum(RAMTerm ramTerm, Bits skipDocs) { this.ramTerm = ramTerm; this.skipDocs = skipDocs; } @Override public int advance(int targetDocID) { do { nextDoc(); } while (upto < ramTerm.docs.size() && current.docID < targetDocID); return NO_MORE_DOCS; } // TODO: override bulk read, for better perf @Override public int nextDoc() { while(true) { upto++; if (upto < ramTerm.docs.size()) { current = ramTerm.docs.get(upto); if (skipDocs == null || !skipDocs.get(current.docID)) { posUpto = 0; return current.docID; } } else { return NO_MORE_DOCS; } } } @Override public int freq() { return current.positions.length; } @Override public int docID() { return current.docID; } @Override public int nextPosition() { return current.positions[posUpto++]; } @Override public boolean hasPayload() { return false; } @Override public BytesRef getPayload() { return null; } } // Holds all indexes created private final Map<String,RAMPostings> state = new HashMap<String,RAMPostings>(); @Override public FieldsConsumer fieldsConsumer(SegmentWriteState writeState) { RAMPostings postings = new RAMPostings(); RAMFieldsConsumer consumer = new RAMFieldsConsumer(postings); synchronized(state) { state.put(writeState.segmentName, postings); } return consumer; } @Override public FieldsProducer fieldsProducer(SegmentReadState readState) throws IOException { return state.get(readState.segmentInfo.name); } @Override public void getExtensions(Set<String> extensions) { } @Override public void files(Directory dir, SegmentInfo segmentInfo, Set<String> files) { } } public static class MyCodecs extends CodecProvider { PerFieldCodecWrapper perField; MyCodecs() { Codec ram = new RAMOnlyCodec(); Codec pulsing = new PulsingReverseTermsCodec(); perField = new PerFieldCodecWrapper(ram); perField.add("field2", pulsing); perField.add("id", pulsing); register(perField); } @Override public Codec getWriter(SegmentWriteState state) { return perField; } } // copied from PulsingCodec, just changing the terms // comparator private static class PulsingReverseTermsCodec extends Codec { public PulsingReverseTermsCodec() { name = "PulsingReverseTerms"; } @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state); // Terms that have <= freqCutoff number of docs are // "pulsed" (inlined): final int freqCutoff = 1; StandardPostingsWriter pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter); // Terms dict index StandardTermsIndexWriter indexWriter; boolean success = false; try { indexWriter = new SimpleStandardTermsIndexWriter(state); success = true; } finally { if (!success) { pulsingWriter.close(); } } // Terms dict success = false; try { FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, reverseUnicodeComparator); success = true; return ret; } finally { if (!success) { try { pulsingWriter.close(); } finally { indexWriter.close(); } } } } @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { StandardPostingsReader docsReader = new StandardPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize); StandardPostingsReader pulsingReader = new PulsingPostingsReaderImpl(docsReader); // Terms dict index reader StandardTermsIndexReader indexReader; boolean success = false; try { indexReader = new SimpleStandardTermsIndexReader(state.dir, state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, reverseUnicodeComparator); success = true; } finally { if (!success) { pulsingReader.close(); } } // Terms dict reader success = false; try { FieldsProducer ret = new StandardTermsDictReader(indexReader, state.dir, state.fieldInfos, state.segmentInfo.name, pulsingReader, state.readBufferSize, reverseUnicodeComparator, StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; } finally { if (!success) { try { pulsingReader.close(); } finally { indexReader.close(); } } } } @Override public void files(Directory dir, SegmentInfo segmentInfo, Set<String> files) throws IOException { StandardPostingsReaderImpl.files(dir, segmentInfo, files); StandardTermsDictReader.files(dir, segmentInfo, files); SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); } @Override public void getExtensions(Set<String> extensions) { StandardCodec.getStandardExtensions(extensions); } } // tests storing "id" and "field2" fields as pulsing codec, // whose term sort is backwards unicode code point, and // storing "field1" as a custom entirely-in-RAM codec public void testPerFieldCodec() throws Exception { final int NUM_DOCS = 173; Random random = newRandom(); Directory dir = newDirectory(random); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(new MyCodecs())); w.setMergeFactor(3); Document doc = new Document(); // uses default codec: doc.add(new Field("field1", "this field uses the standard codec as the test", Field.Store.NO, Field.Index.ANALYZED)); // uses pulsing codec: doc.add(new Field("field2", "this field uses the pulsing codec as the test", Field.Store.NO, Field.Index.ANALYZED)); Field idField = new Field("id", "", Field.Store.NO, Field.Index.NOT_ANALYZED); doc.add(idField); for(int i=0;i<NUM_DOCS;i++) { idField.setValue(""+i); w.addDocument(doc); if ((i+1)%10 == 0) { w.commit(); } } w.deleteDocuments(new Term("id", "77")); IndexReader r = w.getReader(); IndexReader[] subs = r.getSequentialSubReaders(); assertTrue(subs.length > 1); // test each segment for(int i=0;i<subs.length;i++) { //System.out.println("test i=" + i); testTermsOrder(subs[i]); } // test each multi-reader testTermsOrder(r); assertEquals(NUM_DOCS-1, r.numDocs()); IndexSearcher s = new IndexSearcher(r); assertEquals(NUM_DOCS-1, s.search(new TermQuery(new Term("field1", "standard")), 1).totalHits); assertEquals(NUM_DOCS-1, s.search(new TermQuery(new Term("field2", "pulsing")), 1).totalHits); r.close(); s.close(); w.deleteDocuments(new Term("id", "44")); w.optimize(); r = w.getReader(); assertEquals(NUM_DOCS-2, r.maxDoc()); assertEquals(NUM_DOCS-2, r.numDocs()); s = new IndexSearcher(r); assertEquals(NUM_DOCS-2, s.search(new TermQuery(new Term("field1", "standard")), 1).totalHits); assertEquals(NUM_DOCS-2, s.search(new TermQuery(new Term("field2", "pulsing")), 1).totalHits); assertEquals(1, s.search(new TermQuery(new Term("id", "76")), 1).totalHits); assertEquals(0, s.search(new TermQuery(new Term("id", "77")), 1).totalHits); assertEquals(0, s.search(new TermQuery(new Term("id", "44")), 1).totalHits); testTermsOrder(r); r.close(); s.close(); w.close(); dir.close(); } private void testTermsOrder(IndexReader r) throws Exception { // Verify sort order matches what my comparator said: BytesRef lastBytesRef = null; TermsEnum terms = MultiFields.getFields(r).terms("id").iterator(); //System.out.println("id terms:"); while(true) { BytesRef t = terms.next(); if (t == null) { break; } //System.out.println(" " + t); if (lastBytesRef == null) { lastBytesRef = new BytesRef(t); } else { assertTrue("terms in wrong order last=" + lastBytesRef.utf8ToString() + " current=" + t.utf8ToString(), reverseUnicodeComparator.compare(lastBytesRef, t) < 0); lastBytesRef.copy(t); } } } }