/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.perfield;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval;
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.Test;

/**
 * Tests that the per-field postings format picks the configured {@link PostingsFormat} for each
 * field and keeps segments readable across flushes, merges, and codec changes.
 */
// TODO: would be better in this test to pull termsenums and instanceof or something?
// this way we can verify PFPF is doing the right thing.
// for now we do termqueries.
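// PerFieldPostingsFormat records the chosen format's name (plus a unique suffix) in each
// field's FieldInfo attributes at write time, and uses those attributes at read time to
// route every field back to the format that wrote it. The tests below exercise that
// routing indirectly, via term queries, across flushes, merges, and codec swaps.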
public class TestPerFieldPostingsFormat2 extends LuceneTestCase {

  private IndexWriter newWriter(Directory dir, IndexWriterConfig conf) throws IOException {
    LogDocMergePolicy logDocMergePolicy = new LogDocMergePolicy();
    logDocMergePolicy.setNoCFSRatio(0.0); // make sure we use plain files
    conf.setMergePolicy(logDocMergePolicy);

    final IndexWriter writer = new IndexWriter(dir, conf);
    return writer;
  }

  private void addDocs(IndexWriter writer, int numDocs) throws IOException {
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      doc.add(newTextField("content", "aaa", Field.Store.NO));
      writer.addDocument(doc);
    }
  }

  private void addDocs2(IndexWriter writer, int numDocs) throws IOException {
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      doc.add(newTextField("content", "bbb", Field.Store.NO));
      writer.addDocument(doc);
    }
  }

  private void addDocs3(IndexWriter writer, int numDocs) throws IOException {
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      doc.add(newTextField("content", "ccc", Field.Store.NO));
      doc.add(newStringField("id", "" + i, Field.Store.YES));
      writer.addDocument(doc);
    }
  }

  /*
   * Test that heterogeneous index segments are merged successfully
   */
  @Test
  public void testMergeUnusedPerFieldCodec() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
        .setOpenMode(OpenMode.CREATE).setCodec(new MockCodec());
    IndexWriter writer = newWriter(dir, iwconf);
    addDocs(writer, 10);
    writer.commit();
    addDocs3(writer, 10);
    writer.commit();
    addDocs2(writer, 10);
    writer.commit();
    assertEquals(30, writer.maxDoc());
    TestUtil.checkIndex(dir);
    writer.forceMerge(1);
    assertEquals(30, writer.maxDoc());
    writer.close();
    dir.close();
  }

  /*
   * Test that heterogeneous index segments are merged successfully
   */
  // TODO: not sure this test is that great, we should probably peek inside
  // PerFieldPostingsFormat or something?!
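  // Scenario: segments are first written with MockCodec (Memory postings for "content",
  // Direct for "id"); the writer is then reopened with MockCodec2, which routes "content"
  // to the default postings format instead. Older segments must stay readable, and the
  // final forceMerge must rewrite everything with the new per-field choices.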
  @Test
  public void testChangeCodecAndMerge() throws IOException {
    Directory dir = newDirectory();
    if (VERBOSE) {
      System.out.println("TEST: make new index");
    }
    IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
        .setOpenMode(OpenMode.CREATE).setCodec(new MockCodec());
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    IndexWriter writer = newWriter(dir, iwconf);

    addDocs(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "aaa"), dir, 10);
    if (VERBOSE) {
      System.out.println("TEST: addDocs3");
    }
    addDocs3(writer, 10);
    writer.commit();
    writer.close();

    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "aaa"), dir, 10);
    Codec codec = iwconf.getCodec();

    iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
        .setOpenMode(OpenMode.APPEND).setCodec(codec);
    //((LogMergePolicy) iwconf.getMergePolicy()).setNoCFSRatio(0.0);
    //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);

    iwconf.setCodec(new MockCodec2()); // uses standard for field content
    writer = newWriter(dir, iwconf);
    // swap in new codec for currently written segments
    if (VERBOSE) {
      System.out.println("TEST: add docs w/ Standard codec for content field");
    }
    addDocs2(writer, 10);
    writer.commit();
    codec = iwconf.getCodec();
    assertEquals(30, writer.maxDoc());
    assertQuery(new Term("content", "bbb"), dir, 10);
    assertQuery(new Term("content", "ccc"), dir, 10);
    //// assertQuery(new Term("content", "aaa"), dir, 10);

    if (VERBOSE) {
      System.out.println("TEST: add more docs w/ new codec");
    }
    addDocs2(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);
    assertEquals(40, writer.maxDoc());

    if (VERBOSE) {
      System.out.println("TEST: now optimize");
    }
    writer.forceMerge(1);
    assertEquals(40, writer.maxDoc());
    writer.close();
    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);

    dir.close();
  }

  public void assertQuery(Term t, Directory dir, int num) throws IOException {
    if (VERBOSE) {
      System.out.println("\nTEST: assertQuery " + t);
    }
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = newSearcher(reader);
    TopDocs search = searcher.search(new TermQuery(t), num + 10);
    assertEquals(num, search.totalHits);
    reader.close();
  }

  public static class MockCodec extends AssertingCodec {
    final PostingsFormat luceneDefault = TestUtil.getDefaultPostingsFormat();
    final PostingsFormat direct = new DirectPostingsFormat();
    final PostingsFormat memory = new MemoryPostingsFormat();

    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      if (field.equals("id")) {
        return direct;
      } else if (field.equals("content")) {
        return memory;
      } else {
        return luceneDefault;
      }
    }
  }

  public static class MockCodec2 extends AssertingCodec {
    final PostingsFormat luceneDefault = TestUtil.getDefaultPostingsFormat();
    final PostingsFormat direct = new DirectPostingsFormat();

    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      if (field.equals("id")) {
        return direct;
      } else {
        return luceneDefault;
      }
    }
  }

  /*
   * Test per field codec support - adding fields with random codecs
   */
  @Test
  public void testStressPerFieldCodec() throws IOException {
    Directory dir = newDirectory(random());
    final int docsPerRound = 97;
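    // Each round reopens the index (CREATE_OR_APPEND) and adds docsPerRound documents, each
    // carrying a random number of randomly-typed fields; some rounds also force-merge, so
    // segments written under different randomly-picked configurations get mixed and merged.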
    int numRounds = atLeast(1);
    for (int i = 0; i < numRounds; i++) {
      int num = TestUtil.nextInt(random(), 30, 60);
      IndexWriterConfig config = newIndexWriterConfig(random(), new MockAnalyzer(random()));
      config.setOpenMode(OpenMode.CREATE_OR_APPEND);
      IndexWriter writer = newWriter(dir, config);
      for (int j = 0; j < docsPerRound; j++) {
        final Document doc = new Document();
        for (int k = 0; k < num; k++) {
          FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
          customType.setTokenized(random().nextBoolean());
          customType.setOmitNorms(random().nextBoolean());
          Field field = newField("" + k,
              TestUtil.randomRealisticUnicodeString(random(), 128), customType);
          doc.add(field);
        }
        writer.addDocument(doc);
      }
      if (random().nextBoolean()) {
        writer.forceMerge(1);
      }
      writer.commit();
      assertEquals((i + 1) * docsPerRound, writer.maxDoc());
      writer.close();
    }
    dir.close();
  }

  public void testSameCodecDifferentInstance() throws Exception {
    Codec codec = new AssertingCodec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        if ("id".equals(field)) {
          return new MemoryPostingsFormat();
        } else if ("date".equals(field)) {
          return new MemoryPostingsFormat();
        } else {
          return super.getPostingsFormatForField(field);
        }
      }
    };
    doTestMixedPostings(codec);
  }

  public void testSameCodecDifferentParams() throws Exception {
    Codec codec = new AssertingCodec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        if ("id".equals(field)) {
          return new LuceneVarGapFixedInterval(1);
        } else if ("date".equals(field)) {
          return new LuceneVarGapFixedInterval(2);
        } else {
          return super.getPostingsFormatForField(field);
        }
      }
    };
    doTestMixedPostings(codec);
  }

  private void doTestMixedPostings(Codec codec) throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setCodec(codec);
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // turn on vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    Field idField = new Field("id", "", ft);
    Field dateField = new Field("date", "", ft);
    doc.add(idField);
    doc.add(dateField);
    for (int i = 0; i < 100; i++) {
      idField.setStringValue(Integer.toString(random().nextInt(50)));
      dateField.setStringValue(Integer.toString(random().nextInt(100)));
      iw.addDocument(doc);
    }
    iw.close();
    dir.close(); // checkindex
  }

  @SuppressWarnings("deprecation")
  public void testMergeCalledOnTwoFormats() throws IOException {
    MergeRecordingPostingsFormatWrapper pf1 =
        new MergeRecordingPostingsFormatWrapper(TestUtil.getDefaultPostingsFormat());
    MergeRecordingPostingsFormatWrapper pf2 =
        new MergeRecordingPostingsFormatWrapper(TestUtil.getDefaultPostingsFormat());

    IndexWriterConfig iwc = new IndexWriterConfig();
    iwc.setCodec(new AssertingCodec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        switch (field) {
          case "f1":
          case "f2":
            return pf1;
          case "f3":
          case "f4":
            return pf2;
          default:
            return super.getPostingsFormatForField(field);
        }
      }
    });

    Directory directory = newDirectory();

    IndexWriter iwriter = new IndexWriter(directory, iwc);

    Document doc = new Document();
    doc.add(new StringField("f1", "val1", Field.Store.NO));
    doc.add(new StringField("f2", "val2", Field.Store.YES));
    // Points are not indexed as postings and should not appear in the merge fields
    doc.add(new IntPoint("f3", 3));
    doc.add(new StringField("f4", "val4", Field.Store.NO));
    iwriter.addDocument(doc);
    iwriter.commit();

    doc = new Document();
    doc.add(new StringField("f1", "val5", Field.Store.NO));
    doc.add(new StringField("f2", "val6", Field.Store.YES));
    doc.add(new IntPoint("f3", 7));
    doc.add(new StringField("f4", "val8", Field.Store.NO));

    iwriter.addDocument(doc);
    iwriter.commit();

    iwriter.forceMerge(1, true);
    iwriter.close();

    assertEquals(1, pf1.nbMergeCalls);
    assertEquals(new HashSet<>(Arrays.asList("f1", "f2")), new HashSet<>(pf1.fieldNames));
    assertEquals(1, pf2.nbMergeCalls);
    assertEquals(Collections.singletonList("f4"), pf2.fieldNames);

    directory.close();
  }

  private static final class MergeRecordingPostingsFormatWrapper extends PostingsFormat {
    private final PostingsFormat delegate;
    final List<String> fieldNames = new ArrayList<>();
    int nbMergeCalls = 0;

    MergeRecordingPostingsFormatWrapper(PostingsFormat delegate) {
      super(delegate.getName());
      this.delegate = delegate;
    }

    @Override
    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
      final FieldsConsumer consumer = delegate.fieldsConsumer(state);
      return new FieldsConsumer() {
        @Override
        public void write(Fields fields) throws IOException {
          consumer.write(fields);
        }

        @Override
        public void merge(MergeState mergeState) throws IOException {
          nbMergeCalls++;
          for (FieldInfo fi : mergeState.mergeFieldInfos) {
            fieldNames.add(fi.name);
          }
          consumer.merge(mergeState);
        }

        @Override
        public void close() throws IOException {
          consumer.close();
        }
      };
    }

    @Override
    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
      return delegate.fieldsProducer(state);
    }
  }
}