/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.contrib.index.mapred;

import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;

import junit.framework.TestCase;

public class TestIndexUpdater extends TestCase {

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  // however, "we only allow 0 or 1 reducer in local mode" - from
  // LocalJobRunner
  private Configuration conf;
  private Path localInputPath =
      new Path(System.getProperty("build.test") + "/sample/data.txt");
  private Path inputPath = new Path("/myexample/data.txt");
  private Path outputPath = new Path("/myoutput");
  private Path indexPath = new Path("/myindex");
  private int initNumShards = 3;
  private int numMapTasks = 5;

  private int numDataNodes = 3;
  private int numTaskTrackers = 3;

  private int numRuns = 3;
  private int numDocsPerRun = 10; // num of docs in local input path

  private FileSystem fs;
  private MiniDFSCluster dfsCluster;
  private MiniMRCluster mrCluster;

  public TestIndexUpdater() throws IOException {
    super();
    if (System.getProperty("hadoop.log.dir") == null) {
      String base = new File(".").getPath(); // getAbsolutePath();
      System.setProperty("hadoop.log.dir",
          new Path(base).toString() + "/logs");
    }
    conf = new Configuration();
  }

  protected void setUp() throws Exception {
    super.setUp();
    try {
      dfsCluster =
          new MiniDFSCluster(conf, numDataNodes, true, (String[]) null);

      fs = dfsCluster.getFileSystem();
      if (fs.exists(inputPath)) {
        fs.delete(inputPath);
      }
      fs.copyFromLocalFile(localInputPath, inputPath);

      if (fs.exists(outputPath)) {
        // do not create, mapred will create
        fs.delete(outputPath);
      }

      if (fs.exists(indexPath)) {
        fs.delete(indexPath);
      }

      mrCluster =
          new MiniMRCluster(numTaskTrackers, fs.getUri().toString(), 1);

    } catch (IOException e) {
      if (dfsCluster != null) {
        dfsCluster.shutdown();
        dfsCluster = null;
      }

      if (fs != null) {
        fs.close();
        fs = null;
      }
      if (mrCluster != null) {
        mrCluster.shutdown();
        mrCluster = null;
      }

      throw e;
    }
  }

  protected void tearDown() throws Exception {
    if (dfsCluster != null) {
      dfsCluster.shutdown();
      dfsCluster = null;
    }

    if (fs != null) {
      fs.close();
      fs = null;
    }

    if (mrCluster != null) {
      mrCluster.shutdown();
      mrCluster = null;
    }

    super.tearDown();
  }

  public void testIndexUpdater() throws IOException {
    IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
    // max field length, compound file and number of segments will be checked
    // later
    iconf.setIndexMaxFieldLength(2);
    iconf.setIndexUseCompoundFile(true);
    iconf.setIndexMaxNumSegments(1);

    long versionNumber = -1;
    long generation = -1;

    for (int i = 0; i < numRuns; i++) {
      if (fs.exists(outputPath)) {
        fs.delete(outputPath);
      }

      // each run uses one more shard than the previous run
      Shard[] shards = new Shard[initNumShards + i];
      for (int j = 0; j < shards.length; j++) {
        shards[j] = new Shard(versionNumber,
            new Path(indexPath, NUMBER_FORMAT.format(j)).toString(),
            generation);
      }
      run(i + 1, shards);
    }
  }

  private void run(int numRuns, Shard[] shards) throws IOException {
    IIndexUpdater updater = new IndexUpdater();
    updater.run(conf, new Path[] { inputPath }, outputPath, numMapTasks,
        shards);

    // verify the done files
    Path[] doneFileNames = new Path[shards.length];
    int count = 0;
    FileStatus[] fileStatus = fs.listStatus(outputPath);
    for (int i = 0; i < fileStatus.length; i++) {
      FileStatus[] doneFiles = fs.listStatus(fileStatus[i].getPath());
      for (int j = 0; j < doneFiles.length; j++) {
        doneFileNames[count++] = doneFiles[j].getPath();
      }
    }
    assertEquals(shards.length, count);
    for (int i = 0; i < count; i++) {
      assertTrue(doneFileNames[i].getName().startsWith(
          IndexUpdateReducer.DONE.toString()));
    }

    // verify the index
    IndexReader[] readers = new IndexReader[shards.length];
    for (int i = 0; i < shards.length; i++) {
      Directory dir =
          new FileSystemDirectory(fs, new Path(shards[i].getDirectory()),
              false, conf);
      readers[i] = IndexReader.open(dir);
    }
    IndexReader reader = new MultiReader(readers);
    IndexSearcher searcher = new IndexSearcher(reader);
    Hits hits = searcher.search(new TermQuery(new Term("content", "apache")));
    assertEquals(numRuns * numDocsPerRun, hits.length());

    int[] counts = new int[numDocsPerRun];
    for (int i = 0; i < hits.length(); i++) {
      Document doc = hits.doc(i);
      counts[Integer.parseInt(doc.get("id"))]++;
    }

    for (int i = 0; i < numDocsPerRun; i++) {
      assertEquals(numRuns, counts[i]);
    }

    // max field length is 2, so "dot" is also indexed but not "org"
    hits = searcher.search(new TermQuery(new Term("content", "dot")));
    assertEquals(numRuns, hits.length());

    hits = searcher.search(new TermQuery(new Term("content", "org")));
    assertEquals(0, hits.length());

    searcher.close();
    reader.close();

    // open and close an index writer with KeepOnlyLastCommitDeletionPolicy
    // to remove earlier checkpoints
    for (int i = 0; i < shards.length; i++) {
      Directory dir =
          new FileSystemDirectory(fs, new Path(shards[i].getDirectory()),
              false, conf);
      IndexWriter writer =
          new IndexWriter(dir, false, null,
              new KeepOnlyLastCommitDeletionPolicy());
      writer.close();
    }

    // verify the number of segments; must be done after a writer with
    // KeepOnlyLastCommitDeletionPolicy so that earlier checkpoints are removed
    for (int i = 0; i < shards.length; i++) {
      PathFilter cfsFilter = new PathFilter() {
        public boolean accept(Path path) {
          return path.getName().endsWith(".cfs");
        }
      };
      FileStatus[] cfsFiles =
          fs.listStatus(new Path(shards[i].getDirectory()), cfsFilter);
      assertEquals(1, cfsFiles.length);
    }
  }

}