BlurMapReduceUtil.java example

Explorer
incubator-blur-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.blur.mapreduce.lib;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.blur.log.Log;
import org.apache.blur.log.LogFactory;
import org.apache.blur.lucene.security.DocumentVisibility;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;

/**
 * This utility code was taken from HBase to locate classes and the jars files
 * to add to the MapReduce Job.
 */
public class BlurMapReduceUtil {

  private final static Log LOG = LogFactory.getLog(BlurMapReduceUtil.class);

  /**
   * Add the Blur dependency jars as well as jars for any of the configured job
   * classes to the job configuration, so that JobClient will ship them to the
   * cluster and add them to the DistributedCache.
   */
  public static void addDependencyJars(Job job) throws IOException {
    try {
      addDependencyJars(job.getConfiguration(), org.apache.zookeeper.ZooKeeper.class, job.getMapOutputKeyClass(),
          job.getMapOutputValueClass(), job.getInputFormatClass(), job.getOutputKeyClass(), job.getOutputValueClass(),
          job.getOutputFormatClass(), job.getPartitionerClass(), job.getCombinerClass(), DocumentVisibility.class);
      addAllJarsInBlurLib(job.getConfiguration());
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
  }

  /**
   * Adds all the jars in the same path as the blur jar files.
   * 
   * @param conf
   * @throws IOException
   */
  public static void addAllJarsInBlurLib(Configuration conf) throws IOException {
    FileSystem localFs = FileSystem.getLocal(conf);
    Set<String> jars = new HashSet<String>();
    jars.addAll(conf.getStringCollection("tmpjars"));

    String property = System.getProperty("java.class.path");
    String[] files = property.split("\\:");

    String blurLibPath = getPath("blur-", files);
    if (blurLibPath == null) {
      return;
    }
    List<String> pathes = getPathes(blurLibPath, files);
    for (String pathStr : pathes) {
      Path path = new Path(pathStr);
      if (!localFs.exists(path)) {
        LOG.warn("Could not validate jar file " + path);
        continue;
      }
      jars.add(path.makeQualified(localFs.getUri(), localFs.getWorkingDirectory()).toString());
    }
    if (jars.isEmpty()) {
      return;
    }
    conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[0])));
  }

  private static List<String> getPathes(String path, String[] files) {
    List<String> pathes = new ArrayList<String>();
    for (String file : files) {
      if (file.startsWith(path)) {
        pathes.add(file);
      }
    }
    return pathes;
  }

  private static String getPath(String startsWith, String[] files) {
    for (String file : files) {
      int lastIndexOf = file.lastIndexOf('/');
      String fileName = file.substring(lastIndexOf + 1);
      if (fileName.startsWith(startsWith)) {
        return file.substring(0, lastIndexOf);
      }
    }
    return null;
  }

  /**
   * Add the jars containing the given classes to the job's configuration such
   * that JobClient will ship them to the cluster and add them to the
   * DistributedCache.
   */
  public static void addDependencyJars(Configuration conf, Class<?>... classes) throws IOException {
    FileSystem localFs = FileSystem.getLocal(conf);
    Set<String> jars = new HashSet<String>();
    // Add jars that are already in the tmpjars variable
    jars.addAll(conf.getStringCollection("tmpjars"));

    // Add jars containing the specified classes
    for (Class<?> clazz : classes) {
      if (clazz == null) {
        continue;
      }

      String pathStr = findOrCreateJar(clazz);
      if (pathStr == null) {
        LOG.warn("Could not find jar for class " + clazz + " in order to ship it to the cluster.");
        continue;
      }
      Path path = new Path(pathStr);
      if (!localFs.exists(path)) {
        LOG.warn("Could not validate jar file " + path + " for class " + clazz);
        continue;
      }
      jars.add(path.makeQualified(localFs.getUri(), localFs.getWorkingDirectory()).toString());
    }
    if (jars.isEmpty()) {
      return;
    }

    conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[0])));
  }

  /**
   * If org.apache.hadoop.util.JarFinder is available (0.23+ hadoop), finds the
   * Jar for a class or creates it if it doesn't exist. If the class is in a
   * directory in the classpath, it creates a Jar on the fly with the contents
   * of the directory and returns the path to that Jar. If a Jar is created, it
   * is created in the system temporary directory.
   * 
   * Otherwise, returns an existing jar that contains a class of the same name.
   * 
   * @param my_class
   *          the class to find.
   * @return a jar file that contains the class, or null.
   * @throws IOException
   */
  private static String findOrCreateJar(Class<?> my_class) throws IOException {
    try {
      Class<?> jarFinder = Class.forName("org.apache.hadoop.util.JarFinder");
      // hadoop-0.23 has a JarFinder class that will create the jar
      // if it doesn't exist. Note that this is needed to run the mapreduce
      // unit tests post-0.23, because mapreduce v2 requires the relevant jars
      // to be in the mr cluster to do output, split, etc. At unit test time,
      // the hbase jars do not exist, so we need to create some. Note that we
      // can safely fall back to findContainingJars for pre-0.23 mapreduce.
      Method m = jarFinder.getMethod("getJar", Class.class);
      return (String) m.invoke(null, my_class);
    } catch (InvocationTargetException ite) {
      // function was properly called, but threw it's own exception
      throw new IOException(ite.getCause());
    } catch (Exception e) {
      // ignore all other exceptions. related to reflection failure
    }

    LOG.debug("New JarFinder: org.apache.hadoop.util.JarFinder.getJar " + "not available.  Using old findContainingJar");
    return findContainingJar(my_class);
  }

  /**
   * Find a jar that contains a class of the same name, if any. It will return a
   * jar file, even if that is not the first thing on the class path that has a
   * class with the same name.
   * 
   * This is shamelessly copied from JobConf
   * 
   * @param my_class
   *          the class to find.
   * @return a jar file that contains the class, or null.
   * @throws IOException
   */
  private static String findContainingJar(Class<?> my_class) {
    ClassLoader loader = my_class.getClassLoader();
    String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
    try {
      for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
        URL url = itr.nextElement();
        if ("jar".equals(url.getProtocol())) {
          String toReturn = url.getPath();
          if (toReturn.startsWith("file:")) {
            toReturn = toReturn.substring("file:".length());
          }
          // URLDecoder is a misnamed class, since it actually decodes
          // x-www-form-urlencoded MIME type rather than actual
          // URL encoding (which the file path has). Therefore it would
          // decode +s to ' 's which is incorrect (spaces are actually
          // either unencoded or encoded as "%20"). Replace +s first, so
          // that they are kept sacred during the decoding process.
          toReturn = toReturn.replaceAll("\\+", "%2B");
          toReturn = URLDecoder.decode(toReturn, "UTF-8");
          return toReturn.replaceAll("!.*$", "");
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return null;
  }
}