package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;

public class TestPayloads extends LuceneTestCase {

    // Simple tests to test the Payload class
    public void testPayload() throws Exception {
        rnd = newRandom();
        byte[] testData = "This is a test!".getBytes();
        Payload payload = new Payload(testData);
        assertEquals("Wrong payload length.", testData.length, payload.length());

        // test copyTo()
        byte[] target = new byte[testData.length - 1];
        try {
            payload.copyTo(target, 0);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        target = new byte[testData.length + 3];
        payload.copyTo(target, 3);

        for (int i = 0; i < testData.length; i++) {
            assertEquals(testData[i], target[i + 3]);
        }

        // test toByteArray()
        target = payload.toByteArray();
        assertByteArrayEquals(testData, target);

        // test byteAt()
        for (int i = 0; i < testData.length; i++) {
            assertEquals(payload.byteAt(i), testData[i]);
        }

        try {
            payload.byteAt(testData.length + 1);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        Payload clone = (Payload) payload.clone();
        assertEquals(payload.length(), clone.length());
        for (int i = 0; i < payload.length(); i++) {
            assertEquals(payload.byteAt(i), clone.byteAt(i));
        }
    }

    // Tests whether the DocumentWriter and SegmentMerger correctly enable the
    // payload bit in the FieldInfo
    public void testPayloadFieldBit() throws Exception {
        rnd = newRandom();
        Directory ram = newDirectory(rnd);
        PayloadAnalyzer analyzer = new PayloadAnalyzer();
        IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(rnd, TEST_VERSION_CURRENT, analyzer));
        Document d = new Document();
        // this field won't have any payloads
        d.add(new Field("f1", "This field has no payloads",
                        Field.Store.NO, Field.Index.ANALYZED));
        // this field will have payloads in all docs, however not for all term positions,
        // so this field is used to check if the DocumentWriter correctly enables the payloads bit
        // even if only some term positions have payloads
        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
        // enabled in only some documents
        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
        // only add payload data for field f2
        analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);

        writer.addDocument(d);
        // flush
        writer.close();

        SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
        FieldInfos fi = reader.fieldInfos();
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
        reader.close();

        // now we add another document which has payloads for field f3 and verify if the SegmentMerger
        // enabled payloads for that field
        writer = new IndexWriter(ram, newIndexWriterConfig(rnd, TEST_VERSION_CURRENT,
            analyzer).setOpenMode(OpenMode.CREATE));
        d = new Document();
        d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
        // add payload data for field f2 and f3
        analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
        analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);

        writer.addDocument(d);
        // force merge
        writer.optimize();
        // flush
        writer.close();

        reader = SegmentReader.getOnlySegmentReader(ram);
        fi = reader.fieldInfos();
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
        reader.close();
        ram.close();
    }

    // Tests if payloads are correctly stored and loaded using both RAMDirectory and FSDirectory
    public void testPayloadsEncoding() throws Exception {
        rnd = newRandom();
        // first perform the test using a RAMDirectory
        Directory dir = newDirectory(rnd);
        performTest(rnd, dir);
        dir.close();
        // now use an FSDirectory and repeat the same test
        File dirName = _TestUtil.getTempDir("test_payloads");
        dir = FSDirectory.open(dirName);
        performTest(rnd, dir);
        _TestUtil.rmDir(dirName);
        dir.close();
    }

    // builds an index with payloads in the given Directory and performs
    // different tests to verify the payload encoding
    private void performTest(Random random, Directory dir) throws Exception {
        PayloadAnalyzer analyzer = new PayloadAnalyzer();
        IndexWriter writer = new IndexWriter(dir,
            newIndexWriterConfig(random, TEST_VERSION_CURRENT, analyzer)
                .setOpenMode(OpenMode.CREATE));

        // should be in sync with value in TermInfosWriter
        final int skipInterval = 16;

        final int numTerms = 5;
        final String fieldName = "f1";

        int numDocs = skipInterval + 1;
        // create content for the test
        // documents with just a few terms
        Term[] terms = generateTerms(fieldName, numTerms);
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < terms.length; i++) {
            sb.append(terms[i].text());
            sb.append(" ");
        }
        String content = sb.toString();

        int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
        byte[] payloadData = generateRandomData(payloadDataLength);

        Document d = new Document();
        d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
        // add the same document multiple times to have the same payload lengths for all
        // occurrences within two consecutive skip intervals
        int offset = 0;
        for (int i = 0; i < 2 * numDocs; i++) {
            analyzer.setPayloadData(fieldName, payloadData, offset, 1);
            offset += numTerms;
            writer.addDocument(d);
        }

        // make sure we create more than one segment to test merging
        writer.commit();

        // now we make sure to have different payload lengths at the next skip point
        for (int i = 0; i < numDocs; i++) {
            analyzer.setPayloadData(fieldName, payloadData, offset, i);
            offset += i * numTerms;
            writer.addDocument(d);
        }

        writer.optimize();
        // flush
        writer.close();

        /*
         * Verify the index
         * first we test if all payloads are stored correctly
         */
        IndexReader reader = IndexReader.open(dir, true);

        byte[] verifyPayloadData = new byte[payloadDataLength];
        offset = 0;
        DocsAndPositionsEnum[] tps = new DocsAndPositionsEnum[numTerms];
        for (int i = 0; i < numTerms; i++) {
            tps[i] = MultiFields.getTermPositionsEnum(reader,
                                                      MultiFields.getDeletedDocs(reader),
                                                      terms[i].field(),
                                                      new BytesRef(terms[i].text()));
        }

        while (tps[0].nextDoc() != DocsEnum.NO_MORE_DOCS) {
            for (int i = 1; i < numTerms; i++) {
                tps[i].nextDoc();
            }
            int freq = tps[0].freq();

            for (int i = 0; i < freq; i++) {
                for (int j = 0; j < numTerms; j++) {
                    tps[j].nextPosition();
                    BytesRef br = tps[j].getPayload();
                    System.arraycopy(br.bytes, br.offset, verifyPayloadData, offset, br.length);
                    offset += br.length;
                }
            }
        }

        assertByteArrayEquals(payloadData, verifyPayloadData);

        /*
         * test lazy skipping
         */
        DocsAndPositionsEnum tp = MultiFields.getTermPositionsEnum(reader,
                                                                   MultiFields.getDeletedDocs(reader),
                                                                   terms[0].field(),
                                                                   new BytesRef(terms[0].text()));
        tp.nextDoc();
        tp.nextPosition();

        // NOTE: prior rev of this test was failing to first
        // call next here:
        tp.nextDoc();

        // now we don't read this payload
        tp.nextPosition();
        BytesRef payload = tp.getPayload();
        assertEquals("Wrong payload length.", 1, payload.length);
        assertEquals(payload.bytes[payload.offset], payloadData[numTerms]);
        tp.nextDoc();
        tp.nextPosition();

        // we don't read this payload and skip to a different document
        tp.advance(5);
        tp.nextPosition();
        payload = tp.getPayload();
        assertEquals("Wrong payload length.", 1, payload.length);
        assertEquals(payload.bytes[payload.offset], payloadData[5 * numTerms]);

        /*
         * Test different lengths at skip points
         */
        tp = MultiFields.getTermPositionsEnum(reader,
                                              MultiFields.getDeletedDocs(reader),
                                              terms[1].field(),
                                              new BytesRef(terms[1].text()));
        tp.nextDoc();
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayload().length);
        tp.advance(skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayload().length);
        tp.advance(2 * skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayload().length);
        tp.advance(3 * skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayload().length);

        /*
         * Test multiple calls of getPayload()
         */
        assertFalse(tp.hasPayload());

        reader.close();

        // test long payload
        analyzer = new PayloadAnalyzer();
        writer = new IndexWriter(dir, newIndexWriterConfig(random, TEST_VERSION_CURRENT,
            analyzer).setOpenMode(OpenMode.CREATE));
        String singleTerm = "lucene";

        d = new Document();
        d.add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
        // add a payload whose length is greater than the buffer size of BufferedIndexOutput
        payloadData = generateRandomData(2000);
        analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
        writer.addDocument(d);

        writer.optimize();
        // flush
        writer.close();

        reader = IndexReader.open(dir, true);
        tp = MultiFields.getTermPositionsEnum(reader,
                                              MultiFields.getDeletedDocs(reader),
                                              fieldName,
                                              new BytesRef(singleTerm));
        tp.nextDoc();
        tp.nextPosition();

        BytesRef br = tp.getPayload();
        verifyPayloadData = new byte[br.length];
        byte[] portion = new byte[1500];
        System.arraycopy(payloadData, 100, portion, 0, 1500);

        assertByteArrayEquals(portion, br.bytes, br.offset, br.length);
        reader.close();
    }

    private Random rnd;

    private void generateRandomData(byte[] data) {
        rnd.nextBytes(data);
    }

    private byte[] generateRandomData(int n) {
        byte[] data = new byte[n];
        generateRandomData(data);
        return data;
    }

    // creates n terms for the given field, zero-padded so that they sort in increasing order
    private Term[] generateTerms(String fieldName, int n) {
        int maxDigits = (int) (Math.log(n) / Math.log(10));
        Term[] terms = new Term[n];
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            sb.setLength(0);
            sb.append("t");
            int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
            for (int j = 0; j < zeros; j++) {
                sb.append("0");
            }
            sb.append(i);
            terms[i] = new Term(fieldName, sb.toString());
        }
        return terms;
    }

    void assertByteArrayEquals(byte[] b1, byte[] b2) {
        if (b1.length != b2.length) {
            fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
        }

        for (int i = 0; i < b1.length; i++) {
            if (b1[i] != b2[i]) {
                fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
            }
        }
    }

    void assertByteArrayEquals(byte[] b1, byte[] b2, int b2offset, int b2length) {
        if (b1.length != b2length) {
            fail("Byte arrays have different lengths: " + b1.length + ", " + b2length);
        }

        for (int i = 0; i < b1.length; i++) {
            if (b1[i] != b2[b2offset + i]) {
                fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[b2offset + i]);
            }
        }
    }

    /**
     * This Analyzer uses a MockTokenizer (whitespace mode) and a PayloadFilter.
     */
    private static class PayloadAnalyzer extends Analyzer {
        Map<String,PayloadData> fieldToData = new HashMap<String,PayloadData>();

        void setPayloadData(String field, byte[] data, int offset, int length) {
            fieldToData.put(field, new PayloadData(0, data, offset, length));
        }

        void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
            fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
        }

        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            PayloadData payload = fieldToData.get(fieldName);
            TokenStream ts = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            if (payload != null) {
                if (payload.numFieldInstancesToSkip == 0) {
                    ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
                } else {
                    payload.numFieldInstancesToSkip--;
                }
            }
            return ts;
        }

        private static class PayloadData {
            byte[] data;
            int offset;
            int length;
            int numFieldInstancesToSkip;

            PayloadData(int skip, byte[] data, int offset, int length) {
                numFieldInstancesToSkip = skip;
                this.data = data;
                this.offset = offset;
                this.length = length;
            }
        }
    }

    /**
     * This Filter adds payloads to the tokens.
     */
    private static class PayloadFilter extends TokenFilter {
        private byte[] data;
        private int length;
        private int offset;
        Payload payload = new Payload();
        PayloadAttribute payloadAtt;

        public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
            super(in);
            this.data = data;
            this.length = length;
            this.offset = offset;
            payloadAtt = addAttribute(PayloadAttribute.class);
        }

        @Override
        public boolean incrementToken() throws IOException {
            boolean hasNext = input.incrementToken();
            if (hasNext) {
                if (offset + length <= data.length) {
                    Payload p = new Payload();
                    payloadAtt.setPayload(p);
                    p.setData(data, offset, length);
                    offset += length;
                } else {
                    payloadAtt.setPayload(null);
                }
            }
            return hasNext;
        }
    }

    public void testThreadSafety() throws Exception {
        rnd = newRandom();
        final int numThreads = 5;
        final int numDocs = 50 * RANDOM_MULTIPLIER;
        final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);

        Directory dir = newDirectory(rnd);
        final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(rnd,
            TEST_VERSION_CURRENT, new MockAnalyzer()));
        final String field = "test";

        Thread[] ingesters = new Thread[numThreads];
        for (int i = 0; i < numThreads; i++) {
            ingesters[i] = new Thread() {
                @Override
                public void run() {
                    try {
                        for (int j = 0; j < numDocs; j++) {
                            Document d = new Document();
                            d.add(new Field(field, new PoolingPayloadTokenStream(pool)));
                            writer.addDocument(d);
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                        fail(e.toString());
                    }
                }
            };
            ingesters[i].start();
        }

        for (int i = 0; i < numThreads; i++) {
            ingesters[i].join();
        }
        writer.close();

        IndexReader reader = IndexReader.open(dir, true);
        TermsEnum terms = MultiFields.getFields(reader).terms(field).iterator();
        Bits delDocs = MultiFields.getDeletedDocs(reader);
        DocsAndPositionsEnum tp = null;
        while (terms.next() != null) {
            String termText = terms.term().utf8ToString();
            tp = terms.docsAndPositions(delDocs, tp);
            while (tp.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                int freq = tp.freq();
                for (int i = 0; i < freq; i++) {
                    tp.nextPosition();
                    final BytesRef payload = tp.getPayload();
                    assertEquals(termText, pool.bytesToString(payload.bytes, payload.offset, payload.length));
                }
            }
        }
        reader.close();
        dir.close();
        assertEquals(pool.size(), numThreads);
    }

    private class PoolingPayloadTokenStream extends TokenStream {
        private byte[] payload;
        private boolean first;
        private ByteArrayPool pool;
        private String term;

        CharTermAttribute termAtt;
        PayloadAttribute payloadAtt;

        PoolingPayloadTokenStream(ByteArrayPool pool) {
            this.pool = pool;
            payload = pool.get();
            generateRandomData(payload);
            term = pool.bytesToString(payload, 0, payload.length);
            first = true;
            payloadAtt = addAttribute(PayloadAttribute.class);
            termAtt = addAttribute(CharTermAttribute.class);
        }

        @Override
        public boolean incrementToken() throws IOException {
            if (!first) return false;
            first = false;
            clearAttributes();
            termAtt.append(term);
            payloadAtt.setPayload(new Payload(payload));
            return true;
        }

        @Override
        public void close() throws IOException {
            pool.release(payload);
        }
    }

    private static class ByteArrayPool {
        private List<byte[]> pool;

        ByteArrayPool(int capacity, int size) {
            pool = new ArrayList<byte[]>();
            for (int i = 0; i < capacity; i++) {
                pool.add(new byte[size]);
            }
        }

        static String bytesToString(byte[] bytes, int start, int length) {
            String s = new String(bytes, start, length);
            BytesRef utf8Result = new BytesRef(10);
            UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result);
            try {
                return new String(utf8Result.bytes, 0, utf8Result.length, "UTF-8");
            } catch (UnsupportedEncodingException uee) {
                return null;
            }
        }
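
        // get() hands a buffer to a PoolingPayloadTokenStream; the stream's close() returns it
        // via release(), so testThreadSafety can assert the pool is full again after indexing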
        synchronized byte[] get() {
            return pool.remove(0);
        }

        synchronized void release(byte[] b) {
            pool.add(b);
        }

        synchronized int size() {
            return pool.size();
        }
    }
}