/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.reflect.Array;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.security.authorize.ProxyUsers;
import org.apache.hadoop.util.JarFinder;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.cloud.AbstractZkTestCase;
import org.apache.solr.hadoop.hack.MiniMRCluster;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.util.BadHdfsThreadsFilter;
import org.apache.solr.util.BadMrClusterThreadsFilter;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import com.carrotsearch.randomizedtesting.annotations.Nightly;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakAction.Action;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakZombies.Consequence;

/**
 * Runs {@link MapReduceIndexerTool} with a morphline configuration against an embedded
 * {@link MiniDFSCluster} and {@link MiniMRCluster}, and verifies the indexing results.
 */
@ThreadLeakAction({Action.WARN})
@ThreadLeakLingering(linger = 0)
@ThreadLeakZombies(Consequence.CONTINUE)
@ThreadLeakFilters(defaultFilters = true, filters = {
    BadHdfsThreadsFilter.class, BadMrClusterThreadsFilter.class // hdfs currently leaks thread(s)
})
@Slow
@Nightly
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-9076")
public class MorphlineBasicMiniMRTest extends SolrTestCaseJ4 {

  private static final boolean ENABLE_LOCAL_JOB_RUNNER = false; // for debugging only
  private static final String RESOURCES_DIR = getFile("morphlines-core.marker").getParent();
  private static final String DOCUMENTS_DIR = RESOURCES_DIR + "/test-documents";
  private static final File MINIMR_CONF_DIR = new File(RESOURCES_DIR + "/solr/minimr");

  private static String SEARCH_ARCHIVES_JAR;

  private static MiniDFSCluster dfsCluster = null;
  private static MiniMRCluster mrCluster = null;
  private static int numRuns = 0;

  private final String inputAvroFile;
  private final int count;

  private static String tempDir;
  private static File solrHomeDirectory;

  protected MapReduceIndexerTool createTool() {
    return new MapReduceIndexerTool();
  }

  public MorphlineBasicMiniMRTest() {
    // Randomly pick one of three input Avro files, together with the number of documents
    // it is expected to yield.
    int data = random().nextInt(3);
    switch (data) {
      case 0:
        this.inputAvroFile = "sample-statuses-20120906-141433.avro";
        this.count = 2;
        break;
      case 1:
        this.inputAvroFile = "sample-statuses-20120521-100919.avro";
        this.count = 20;
        break;
      case 2:
        this.inputAvroFile = "sample-statuses-20120906-141433-medium.avro";
        this.count = 2104;
        break;
      default:
        throw new RuntimeException("Test setup is broken");
    }
  }

  @BeforeClass
  public static void setupClass() throws Exception {
    solrHomeDirectory = createTempDir().toFile();
    assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
        Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false")));
    assumeFalse("This test fails on Java 9 (https://issues.apache.org/jira/browse/SOLR-8876)",
        Constants.JRE_IS_MINIMUM_JAVA9);
    assumeFalse("FIXME: This test does not work with Windows because of native library requirements",
        Constants.WINDOWS);

    AbstractZkTestCase.SOLRHOME = solrHomeDirectory;
    FileUtils.copyDirectory(MINIMR_CONF_DIR, solrHomeDirectory);
    File dataDir = createTempDir().toFile();
    tempDir = dataDir.getAbsolutePath();
    new File(tempDir).mkdirs();
    FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"),
        new File(tempDir + "/custom-mimetypes.xml"));
    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);

    System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath());

    int taskTrackers = 1;
    int dataNodes = 2;
    // String proxyUser = System.getProperty("user.name");
    // String proxyGroup = "g";
    // StringBuilder sb = new StringBuilder();
    // sb.append("127.0.0.1,localhost");
    // for (InetAddress i : InetAddress.getAllByName(InetAddress.getLocalHost().getHostName())) {
    //   sb.append(",").append(i.getCanonicalHostName());
    // }

    new File(dataDir, "nm-local-dirs").mkdirs();

    System.setProperty("solr.hdfs.blockcache.enabled", "false");

    System.setProperty("test.build.dir", dataDir + File.separator + "hdfs" + File.separator + "test-build-dir");
    System.setProperty("test.build.data", dataDir + File.separator + "hdfs" + File.separator + "build");
    System.setProperty("test.cache.data", dataDir + File.separator + "hdfs" + File.separator + "cache");

    // Initialize AFTER test.build.dir is set, JarFinder uses it.
    SEARCH_ARCHIVES_JAR = JarFinder.getJar(MapReduceIndexerTool.class);

    JobConf conf = new JobConf();
    conf.set("dfs.block.access.token.enable", "false");
    conf.set("dfs.permissions", "true");
    conf.set("hadoop.security.authentication", "simple");
    conf.set(YarnConfiguration.NM_LOCAL_DIRS, dataDir.getPath() + File.separator + "nm-local-dirs");
    conf.set(YarnConfiguration.DEFAULT_NM_LOG_DIRS, dataDir + File.separator + "nm-logs");
    conf.set("testWorkDir", dataDir.getPath() + File.separator + "testWorkDir");
    conf.set("mapreduce.jobhistory.minicluster.fixed.ports", "false");
    conf.set("mapreduce.jobhistory.admin.address", "0.0.0.0:0");

    dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
    FileSystem fileSystem = dfsCluster.getFileSystem();
    fileSystem.mkdirs(new Path("/tmp"));
    fileSystem.mkdirs(new Path("/user"));
    fileSystem.mkdirs(new Path("/hadoop/mapred/system"));
    fileSystem.setPermission(new Path("/tmp"), FsPermission.valueOf("-rwxrwxrwx"));
    fileSystem.setPermission(new Path("/user"), FsPermission.valueOf("-rwxrwxrwx"));
    fileSystem.setPermission(new Path("/hadoop/mapred/system"), FsPermission.valueOf("-rwx------"));

    String nnURI = fileSystem.getUri().toString();
    int numDirs = 1;
    String[] racks = null;
    String[] hosts = null;

    mrCluster = new MiniMRCluster(0, 0, taskTrackers, nnURI, numDirs, racks, hosts, null, conf);
    ProxyUsers.refreshSuperUserGroupsConfiguration(conf);
  }

  @AfterClass
  public static void teardownClass() throws Exception {
    System.clearProperty("solr.hdfs.blockcache.enabled");
    System.clearProperty("test.build.dir");
    System.clearProperty("test.build.data");
    System.clearProperty("test.cache.data");

    if (mrCluster != null) {
      mrCluster.shutdown();
      mrCluster = null;
    }
    if (dfsCluster != null) {
      dfsCluster.shutdown();
      dfsCluster = null;
    }
    FileSystem.closeAll();
  }

  @After
  public void tearDown() throws Exception {
    System.clearProperty("hadoop.log.dir");
    System.clearProperty("solr.hdfs.blockcache.enabled");
    super.tearDown();
  }

  private JobConf getJobConf() {
    return mrCluster.createJobConf();
  }

  @Test
  public void testPathParts() throws Exception { // see PathParts
    FileSystem fs = dfsCluster.getFileSystem();
    int dfsClusterPort = fs.getWorkingDirectory().toUri().getPort();
    assertTrue(dfsClusterPort > 0);
    JobConf jobConf = getJobConf();
    Configuration simpleConf = new Configuration();

    for (Configuration conf : Arrays.asList(jobConf, simpleConf)) {
      for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) {
        for (String up : Arrays.asList("", "../")) {
          String down = up.length() == 0 ? "foo/" : "";
          String uploadURL = "hdfs://localhost:12345/user/foo/" + up + "bar.txt" + queryAndFragment;
          PathParts parts = new PathParts(uploadURL, conf);
          assertEquals(uploadURL, parts.getUploadURL());
          assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
          assertEquals("bar.txt", parts.getName());
          assertEquals("hdfs", parts.getScheme());
          assertEquals("localhost", parts.getHost());
          assertEquals(12345, parts.getPort());
          assertEquals("hdfs://localhost:12345/user/" + down + "bar.txt", parts.getId());
          assertEquals(parts.getId(), parts.getDownloadURL());
          assertFileNotFound(parts);

          uploadURL = "hdfs://localhost/user/foo/" + up + "bar.txt" + queryAndFragment;
          parts = new PathParts(uploadURL, conf);
          assertEquals(uploadURL, parts.getUploadURL());
          assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
          assertEquals("bar.txt", parts.getName());
          assertEquals("hdfs", parts.getScheme());
          assertEquals("localhost", parts.getHost());
          assertEquals(8020, parts.getPort());
          assertEquals("hdfs://localhost:8020/user/" + down + "bar.txt", parts.getId());
          assertEquals(parts.getId(), parts.getDownloadURL());
          assertFileNotFound(parts);
        }
      }
    }

    for (Configuration conf : Arrays.asList(jobConf)) {
      for (String queryAndFragment : Arrays.asList("", "?key=value#fragment")) {
        for (String up : Arrays.asList("", "../")) {
          // verify using absolute path
          String down = up.length() == 0 ? "foo/" : "";
          String uploadURL = "/user/foo/" + up + "bar.txt" + queryAndFragment;
          PathParts parts = new PathParts(uploadURL, conf);
          assertEquals(uploadURL, parts.getUploadURL());
          assertEquals("/user/" + down + "bar.txt", parts.getURIPath());
          assertEquals("bar.txt", parts.getName());
          assertEquals("hdfs", parts.getScheme());
          assertTrue("localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost()));
          assertEquals(dfsClusterPort, parts.getPort());
          assertTrue(parts.getId().equals("hdfs://localhost:" + dfsClusterPort + "/user/" + down + "bar.txt")
              || parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort + "/user/" + down + "bar.txt"));
          assertFileNotFound(parts);

          // verify relative path is interpreted to be relative to user's home dir and resolved to an absolute path
          uploadURL = "xuser/foo/" + up + "bar.txt" + queryAndFragment;
          parts = new PathParts(uploadURL, conf);
          assertEquals(uploadURL, parts.getUploadURL());
          String homeDir = "/user/" + System.getProperty("user.name");
          assertEquals(homeDir + "/xuser/" + down + "bar.txt", parts.getURIPath());
          assertEquals("bar.txt", parts.getName());
          assertEquals("hdfs", parts.getScheme());
          assertTrue("localhost".equals(parts.getHost()) || "localhost.localdomain".equals(parts.getHost()));
          assertEquals(dfsClusterPort, parts.getPort());
          assertTrue(parts.getId().equals("hdfs://localhost:" + dfsClusterPort + homeDir + "/xuser/" + down + "bar.txt")
              || parts.getId().equals("hdfs://localhost.localdomain:" + dfsClusterPort + homeDir + "/xuser/" + down + "bar.txt"));
          assertFileNotFound(parts);
        }
      }
    }

    try {
      new PathParts("/user/foo/bar.txt", simpleConf);
      fail("host/port resolution requires minimr conf, not a simple conf");
    } catch (IllegalArgumentException e) {
      ; // expected
    }
  }

  private void assertFileNotFound(PathParts parts) {
    try {
      parts.getFileSystem().getFileStatus(parts.getUploadPath());
      fail();
    } catch (IOException e) {
      ; // expected
    }
  }

  @Test
  public void mrRun() throws Exception {
    FileSystem fs = dfsCluster.getFileSystem();
    Path inDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/input"));
    fs.delete(inDir, true);
    String DATADIR = "/user/testing/testMapperReducer/data";
    Path dataDir = fs.makeQualified(new Path(DATADIR));
    fs.delete(dataDir, true);
    Path outDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/output"));
    fs.delete(outDir, true);

    assertTrue(fs.mkdirs(inDir));
    Path INPATH = new Path(inDir, "input.txt");
    OutputStream os = fs.create(INPATH);
    Writer wr = new OutputStreamWriter(os, StandardCharsets.UTF_8);
    wr.write(DATADIR + "/" + inputAvroFile);
    wr.close();

    assertTrue(fs.mkdirs(dataDir));
    fs.copyFromLocalFile(new Path(DOCUMENTS_DIR, inputAvroFile), dataDir);

    JobConf jobConf = getJobConf();
    jobConf.set("jobclient.output.filter", "ALL");
    if (ENABLE_LOCAL_JOB_RUNNER) {
      // enable Hadoop LocalJobRunner; this makes it possible to run in a debugger and set breakpoints
      jobConf.set("mapred.job.tracker", "local");
    }
    jobConf.setMaxMapAttempts(1);
    jobConf.setMaxReduceAttempts(1);
    jobConf.setJar(SEARCH_ARCHIVES_JAR);

    int shards = 2;
    int maxReducers = Integer.MAX_VALUE;
    if (ENABLE_LOCAL_JOB_RUNNER) {
      // local job runner has a couple of limitations: only one reducer is supported and the DistributedCache doesn't work.
      // see http://blog.cloudera.com/blog/2009/07/advice-on-qa-testing-your-mapreduce-jobs/
      maxReducers = 1;
      shards = 1;
    }

    String[] args = new String[] {
        "--morphline-file=" + tempDir + "/test-morphlines/solrCellDocumentTypes.conf",
        "--morphline-id=morphline1",
        "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
        "--output-dir=" + outDir.toString(),
        "--shards=" + shards,
        "--verbose",
        numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(),
        numRuns % 3 == 0
            ? "--reducers=" + shards
            : (numRuns % 3 == 1 ? "--reducers=-1" : "--reducers=" + Math.min(8, maxReducers))
    };
    if (numRuns % 3 == 2) {
      args = concat(args, new String[] {"--fanout=2"});
    }
    if (numRuns == 0) {
      // force (slow) MapReduce based randomization to get coverage for that as well
      args = concat(new String[] {"-D", MapReduceIndexerTool.MAIN_MEMORY_RANDOMIZATION_THRESHOLD + "=-1"}, args);
    }
    MapReduceIndexerTool tool = createTool();
    int res = ToolRunner.run(jobConf, tool, args);
    assertEquals(0, res);
    Job job = tool.job;
    assertTrue(job.isComplete());
    assertTrue(job.isSuccessful());

    if (numRuns % 3 != 2) {
      // Only run this check if mtree merge is disabled.
      // With mtree merge enabled the BatchWriter counters aren't available anymore because
      // variable "job" now refers to the merge job rather than the indexing job
      assertEquals(
          "Invalid counter " + SolrRecordWriter.class.getName() + "." + SolrCounters.DOCUMENTS_WRITTEN,
          count,
          job.getCounters()
              .findCounter(SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString())
              .getValue());
    }

    // Check the output is as expected
    outDir = new Path(outDir, MapReduceIndexerTool.RESULTS_DIR);
    Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(outDir));

    System.out.println("outputfiles:" + Arrays.toString(outputFiles));

    UtilsForTests.validateSolrServerDocumentCount(MINIMR_CONF_DIR, fs, outDir, count, shards);

    // run again with --dry-run mode:
    tool = createTool();
    args = concat(args, new String[] {"--dry-run"});
    res = ToolRunner.run(jobConf, tool, args);
    assertEquals(0, res);

    numRuns++;
  }
  /** Concatenates the given arrays into a single new array. */
  protected static <T> T[] concat(T[]... arrays) {
    if (arrays.length <= 0) {
      throw new IllegalArgumentException();
    }
    Class<?> clazz = null;
    int length = 0;
    for (T[] array : arrays) {
      clazz = array.getClass();
      length += array.length;
    }
    @SuppressWarnings("unchecked")
    T[] result = (T[]) Array.newInstance(clazz.getComponentType(), length);
    int pos = 0;
    for (T[] array : arrays) {
      System.arraycopy(array, 0, result, pos, array.length);
      pos += array.length;
    }
    return result;
  }

}