/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.net.URI;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import java.util.jar.JarOutputStream;
import java.util.zip.ZipEntry;

import javax.tools.JavaCompiler;
import javax.tools.JavaCompiler.CompilationTask;
import javax.tools.JavaFileObject;
import javax.tools.StandardJavaFileManager;
import javax.tools.ToolProvider;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.ExecType;
import org.apache.pig.FuncSpec;
import org.apache.pig.LoadFunc;
import org.apache.pig.PigServer;
import org.apache.pig.backend.hadoop.executionengine.JobCreationException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceOper;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.plans.MROperPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad;
import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.plan.OperatorKey;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

public class TestJobControlCompiler {

    private static final Configuration CONF = new Configuration();

    @BeforeClass
    public static void setupClass() throws Exception {
        // creating a hadoop-site.xml and making it visible to Pig
        // making sure it is at the same location as for other tests to not pick
        // up a conf from a previous test
        File conf_dir = new File("build/classes");
        File hadoopSite = new File(conf_dir, "hadoop-site.xml");
        hadoopSite.deleteOnExit();
        FileWriter fw = new FileWriter(hadoopSite);
        try {
            fw.write("<?xml version=\"1.0\"?>\n");
            fw.write("<?xml-stylesheet type=\"text/xsl\" href=\"nutch-conf.xsl\"?>\n");
            fw.write("<configuration>\n");
            fw.write("</configuration>\n");
        } finally {
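            // closing the writer flushes the minimal hadoop-site.xml to disk
            // before the classloader set below tries to load it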
            fw.close();
        }
        // making hadoop-site.xml visible to Pig as it REQUIRES!!! one when
        // running in mapred mode
        Thread.currentThread().setContextClassLoader(
                new URLClassLoader(new URL[] { conf_dir.toURI().toURL() }));
    }

    /**
     * specifically tests that REGISTERED jars get added to distributed cache
     * @throws Exception
     */
    @Test
    public void testJarAddedToDistributedCache() throws Exception {

        // creating a jar with a UDF *not* in the current classloader
        File tmpFile = File.createTempFile("Some_", ".jar");
        tmpFile.deleteOnExit();
        String className = createTestJar(tmpFile);
        final String testUDFFileName = className + ".class";

        // JobControlCompiler setup
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
        PigContext pigContext = pigServer.getPigContext();
        pigContext.connect();
        pigContext.addJar(tmpFile.getAbsolutePath());
        JobControlCompiler jobControlCompiler = new JobControlCompiler(pigContext, CONF);
        MROperPlan plan = new MROperPlan();
        MapReduceOper mro = new MapReduceOper(new OperatorKey());
        mro.UDFs = new HashSet<String>();
        mro.UDFs.add(className + "()");
        plan.add(mro);

        // compiling the job
        JobControl jobControl = jobControlCompiler.compile(plan, "test");
        JobConf jobConf = jobControl.getWaitingJobs().get(0).getJobConf();

        // verifying the jar gets on distributed cache
        Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
        // guava jar is not shipped with Hadoop 2.x
        Assert.assertEquals("size for " + Arrays.toString(fileClassPaths),
                HadoopShims.isHadoopYARN() ? 5 : 6, fileClassPaths.length);
        Path distributedCachePath = fileClassPaths[0];
        Assert.assertEquals("ends with jar name: " + distributedCachePath,
                distributedCachePath.getName(), tmpFile.getName());
        // a hadoop bug requires the path to not contain hdfs://hostname in front
        Assert.assertTrue("starts with /: " + distributedCachePath,
                distributedCachePath.toString().startsWith("/"));
        Assert.assertTrue("jar pushed to distributed cache should contain testUDF",
                jarContainsFileNamed(new File(fileClassPaths[0].toUri().getPath()),
                        testUDFFileName));
    }

    /**
     * creates one temp file per extension, each marked for deletion on exit
     */
    private static List<File> createFiles(String... extensions) throws IOException {
        List<File> files = new ArrayList<File>();
        for (String extension : extensions) {
            File file = File.createTempFile("file", extension);
            file.deleteOnExit();
            files.add(file);
        }
        return files;
    }
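    /**
     * asserts that the distributed cache holds exactly {@code size} URIs and
     * that each one ends with the expected extension, positionally
     */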
    private static void assertFilesInDistributedCache(URI[] uris, int size,
            String... extensions) {
        Assert.assertEquals(size, uris.length);
        for (int i = 0; i < uris.length; i++) {
            Assert.assertTrue(uris[i].toString().endsWith(extensions[i]));
        }
    }

    @Test
    public void testAddArchiveToDistributedCache() throws IOException {
        final File textFile = File.createTempFile("file", ".txt");
        textFile.deleteOnExit();

        final List<File> zipArchives = createFiles(".zip");
        zipArchives.add(textFile);
        final List<File> tarArchives = createFiles(".tgz", ".tar.gz", ".tar");

        final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
        final PigContext pigContext = pigServer.getPigContext();
        pigContext.connect();
        pigContext.getProperties().put("pig.streaming.ship.files",
                StringUtils.join(zipArchives, ","));
        pigContext.getProperties().put("pig.streaming.cache.files",
                StringUtils.join(tarArchives, ","));

        final JobConf jobConf = compileTestJob(pigContext, CONF);

        // the plain .txt file must be shipped as a regular cache file,
        // not as an archive
        URI[] uris = DistributedCache.getCacheFiles(jobConf);
        int sizeTxt = 0;
        for (int i = 0; i < uris.length; i++) {
            if (uris[i].toString().endsWith(".txt")) {
                sizeTxt++;
            }
        }
        Assert.assertEquals(1, sizeTxt);
        assertFilesInDistributedCache(
                DistributedCache.getCacheArchives(jobConf), 4,
                ".zip", ".tgz", ".tar.gz", ".tar");
    }

    private JobConf compileTestJob(final PigContext pigContext, Configuration conf)
            throws JobCreationException {
        final JobControlCompiler jobControlCompiler = new JobControlCompiler(
                pigContext, conf);

        final MROperPlan plan = new MROperPlan();
        plan.add(new MapReduceOper(new OperatorKey()));

        final JobControl jobControl = jobControlCompiler.compile(plan, "test");
        final JobConf jobConf = jobControl.getWaitingJobs().get(0).getJobConf();
        return jobConf;
    }
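    // Note on the URIs below: a '#' fragment in a distributed cache URI
    // (e.g. /lib/udf-0.jar#udf.jar) makes Hadoop create a symlink with the
    // fragment's name in the task's working directory, so the same jar can be
    // referenced both by its path and by its symlinked name.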
    /**
     * Tests that no duplicate jars are added to distributed cache, which might cause conflicts,
     * and tests with both symlinked and normal jar specification
     */
    @Test
    public void testNoDuplicateJarsInDistributedCache() throws Exception {

        // JobControlCompiler setup
        final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
        PigContext pigContext = pigServer.getPigContext();
        pigContext.connect();

        Configuration conf = new Configuration();
        DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf-0.jar#udf.jar")),
                conf, FileSystem.get(conf));
        DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf1.jar#diffname.jar")),
                conf, FileSystem.get(conf));
        DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf2.jar")),
                conf, FileSystem.get(conf));
        createAndAddResource("udf.jar", pigContext);
        createAndAddResource("udf1.jar", pigContext);
        createAndAddResource("udf2.jar", pigContext);
        createAndAddResource("another.jar", pigContext);

        final JobConf jobConf = compileTestJob(pigContext, conf);

        // verifying the jar gets on distributed cache
        URI[] cacheURIs = DistributedCache.getCacheFiles(jobConf);
        Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
        // expected - 1. udf.jar#udf.jar, 2. udf1.jar#diffname.jar, 3. udf2.jar (same added twice),
        // 4. another.jar and 5. udf1.jar, and no duplicate udf.jar
        System.out.println("cache.files= " + Arrays.toString(cacheURIs));
        System.out.println("classpath.files= " + Arrays.toString(fileClassPaths));
        if (HadoopShims.isHadoopYARN()) {
            // Default jars - 4 (pig, antlr, joda-time, automaton)
            // Other jars - 5 (udf.jar#udf.jar, udf1.jar#diffname.jar, udf2.jar, udf1.jar, another.jar)
            Assert.assertEquals("size 9 for " + Arrays.toString(cacheURIs), 9,
                    Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
            Assert.assertEquals("size 9 for " + Arrays.toString(fileClassPaths), 9,
                    Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
        } else {
            // Default jars - 5. Has guava in addition
            // There will be same entries duplicated for udf.jar and udf2.jar
            Assert.assertEquals("size 12 for " + Arrays.toString(cacheURIs), 12,
                    Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
            Assert.assertEquals("size 12 for " + Arrays.toString(fileClassPaths), 12,
                    Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
        }

        // Count occurrences of the resources
        Map<String, Integer> occurrences = new HashMap<String, Integer>();
        for (URI cacheURI : cacheURIs) {
            Integer val = occurrences.get(cacheURI.toString());
            val = (val == null) ? 1 : ++val;
            occurrences.put(cacheURI.toString(), val);
        }
        if (HadoopShims.isHadoopYARN()) {
            Assert.assertEquals(9, occurrences.size());
        } else {
            Assert.assertEquals(10, occurrences.size()); // guava jar in addition
        }

        for (String file : occurrences.keySet()) {
            if (!HadoopShims.isHadoopYARN()
                    && (file.endsWith("udf.jar") || file.endsWith("udf2.jar"))) {
                // Same path added twice, which is ok. It should not be shipped to a hdfs
                // temp path. We assert the path is the same by checking the count
                Assert.assertEquals("Two occurrences for " + file, 2, (int) occurrences.get(file));
            } else {
                // check that there is only a single occurrence even though we added once to
                // the dist cache (simulating addition via Oozie) and a second time through
                // pig register jar when there is a symlink
                Assert.assertEquals("One occurrence for " + file, 1, (int) occurrences.get(file));
            }
        }
    }

    /**
     * creates an empty jar file in the working directory and registers it with Pig
     */
    private File createAndAddResource(String name, PigContext pigContext) throws IOException {
        File f = new File(name);
        f.createNewFile();
        f.deleteOnExit();
        pigContext.addJar(name);
        return f;
    }

    @Test
    public void testEstimateNumberOfReducers() throws Exception {
        // the estimate is the input size divided by the default
        // pig.exec.reducers.bytes.per.reducer (1000 * 1000 * 1000), rounded up
        Assert.assertEquals(2, JobControlCompiler.estimateNumberOfReducers(
                new Job(CONF), createMockPOLoadMapReduceOper(2L * 1000 * 1000 * 999)));

        Assert.assertEquals(2, JobControlCompiler.estimateNumberOfReducers(
                new Job(CONF), createMockPOLoadMapReduceOper(2L * 1000 * 1000 * 1000)));

        Assert.assertEquals(3, JobControlCompiler.estimateNumberOfReducers(
                new Job(CONF), createMockPOLoadMapReduceOper(2L * 1000 * 1000 * 1001)));
    }

    private static MapReduceOper createMockPOLoadMapReduceOper(long size) throws Exception {
        MapReduceOper mro = new MapReduceOper(new OperatorKey());
        mro.mapPlan.add(createPOLoadWithSize(size, new PigStorage()));
        return mro;
    }
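    /**
     * creates a POLoad over a temp file padded to the given length, so that
     * reducer estimation sees a deterministic input size
     * @param size the length in bytes of the backing temp file
     * @param loadFunc the load function to attach to the POLoad
     */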
    public static POLoad createPOLoadWithSize(long size, LoadFunc loadFunc) throws Exception {
        File file = File.createTempFile("tempFile", ".tmp");
        file.deleteOnExit();
        RandomAccessFile f = new RandomAccessFile(file, "rw");
        f.setLength(size);
        f.close();

        loadFunc.setLocation(file.getAbsolutePath(), new Job(CONF));
        FuncSpec funcSpec = new FuncSpec(loadFunc.getClass().getCanonicalName());
        POLoad poLoad = new POLoad(new OperatorKey(), loadFunc);
        poLoad.setLFile(new FileSpec(file.getAbsolutePath(), funcSpec));
        poLoad.setPc(new PigContext());
        poLoad.setUp();

        return poLoad;
    }

    /**
     * checks if the given file name is in the jar
     * @param jarFile the jar to check
     * @param name the name to find (full path in the jar)
     * @return true if the name was found
     * @throws IOException
     */
    private boolean jarContainsFileNamed(File jarFile, String name) throws IOException {
        JarFile jar = new JarFile(jarFile);
        try {
            Enumeration<JarEntry> entries = jar.entries();
            while (entries.hasMoreElements()) {
                JarEntry entry = entries.nextElement();
                if (entry.getName().equals(name)) {
                    return true;
                }
            }
            return false;
        } finally {
            // close the JarFile to avoid leaking the underlying file handle
            jar.close();
        }
    }

    /**
     * creates a jar containing a UDF not in the current classloader
     * @param jarFile the jar to create
     * @return the name of the class created (in the default package)
     * @throws IOException
     * @throws FileNotFoundException
     */
    private String createTestJar(File jarFile) throws IOException, FileNotFoundException {

        // creating the source .java file
        File javaFile = File.createTempFile("TestUDF", ".java");
        javaFile.deleteOnExit();
        String className = javaFile.getName().substring(0, javaFile.getName().lastIndexOf('.'));
        FileWriter fw = new FileWriter(javaFile);
        try {
            fw.write("import org.apache.pig.EvalFunc;\n");
            fw.write("import org.apache.pig.data.Tuple;\n");
            fw.write("import java.io.IOException;\n");
            fw.write("public class " + className + " extends EvalFunc<String> {\n");
            fw.write("  public String exec(Tuple input) throws IOException {\n");
            fw.write("    return \"test\";\n");
            fw.write("  }\n");
            fw.write("}\n");
        } finally {
            fw.close();
        }

        // compiling it
        JavaCompiler compiler = ToolProvider.getSystemJavaCompiler();
        StandardJavaFileManager fileManager = compiler.getStandardFileManager(null, null, null);
        Iterable<? extends JavaFileObject> compilationUnits1 =
                fileManager.getJavaFileObjects(javaFile);
        CompilationTask task = compiler.getTask(null, fileManager, null, null, null,
                compilationUnits1);
        Assert.assertTrue("compilation of the test UDF failed", task.call());

        // here is the compiled file
        File classFile = new File(javaFile.getParentFile(), className + ".class");
        Assert.assertTrue(classFile.exists());

        // putting it in the jar
        JarOutputStream jos = new JarOutputStream(new FileOutputStream(jarFile));
        try {
            jos.putNextEntry(new ZipEntry(classFile.getName()));
            try {
                InputStream testClassContentIS = new FileInputStream(classFile);
                try {
                    byte[] buffer = new byte[64000];
                    int n;
                    while ((n = testClassContentIS.read(buffer)) != -1) {
                        jos.write(buffer, 0, n);
                    }
                } finally {
                    testClassContentIS.close();
                }
            } finally {
                jos.closeEntry();
            }
        } finally {
            jos.close();
        }
        return className;
    }
}