/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.URLClassLoader;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;

import org.antlr.runtime.CommonTokenStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.impl.PigContext;
import org.apache.tools.bzip2r.BZip2Constants;
import org.joda.time.DateTime;

import com.google.common.collect.Multimaps;

import dk.brics.automaton.Automaton;

/**
 * Static helpers for packaging script files into a jar, building a class loader over
 * registered jars, and locating the jar files that contain given classes.
 */
public class JarManager {

    private static final Log log = LogFactory.getLog(JarManager.class);

    /**
     * One representative class per library; {@link #getDefaultJars()} resolves each to the
     * jar that contains it.
     */
    private static enum DefaultPigPackages {

        PIG(PigMapReduce.class),
        BZIP2R(BZip2Constants.class),
        AUTOMATON(Automaton.class),
        ANTLR(CommonTokenStream.class),
        JODATIME(DateTime.class);

        private final Class<?> pkgClass;

        DefaultPigPackages(Class<?> pkgClass) {
            this.pkgClass = pkgClass;
        }

        public Class<?> getPkgClass() {
            return pkgClass;
        }
    }

    /**
     * Packages the script files known to the given context into a temporary jar and renames
     * it to {@code PigScriptUDF-<md5>.jar}, where {@code <md5>} is the MD5 hex digest of the
     * jar's bytes.
     *
     * @param pigContext
     *            the context whose script files are packaged
     * @return the jar file, or {@code null} when there were no script files to package. If the
     *         temporary jar cannot be renamed, an already existing file with the digest-based
     *         name is returned; failing that, the temporary jar itself is returned.
     * @throws IOException
     *             if a script file cannot be found or the jar cannot be written or read back
     */
    public static File createPigScriptUDFJar(PigContext pigContext) throws IOException {
        File scriptUDFJarFile = File.createTempFile("PigScriptUDF", ".jar");
        // ensure the scriptUDFJarFile is deleted on exit
        scriptUDFJarFile.deleteOnExit();

        HashMap<String, String> contents = new HashMap<String, String>();
        FileOutputStream fos = new FileOutputStream(scriptUDFJarFile);
        try {
            createPigScriptUDFJar(fos, pigContext, contents);
        } finally {
            // The helper closes the stream only when it completes normally; closing here as
            // well releases the file handle when it throws. A second close is harmless.
            fos.close();
        }

        if (contents.isEmpty()) {
            return null;
        }

        String md5 = null;
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(scriptUDFJarFile);
            md5 = org.apache.commons.codec.digest.DigestUtils.md5Hex(fis);
        } finally {
            if (fis != null) {
                fis.close();
            }
        }

        File newScriptUDFJarFile =
                new File(scriptUDFJarFile.getParent(), "PigScriptUDF-" + md5 + ".jar");
        if (scriptUDFJarFile.renameTo(newScriptUDFJarFile)) {
            return newScriptUDFJarFile;
        }
        if (newScriptUDFJarFile.exists()) {
            // The rename failed but a file with this digest-based name is already present
            // (presumably left by an earlier run over the same scripts); hand that one back,
            // as the code did before the rename result was checked.
            return newScriptUDFJarFile;
        }
        // Never return a path that does not exist: fall back to the temporary jar, which
        // still holds the packaged scripts.
        log.warn("Could not rename " + scriptUDFJarFile + " to " + newScriptUDFJarFile
                + "; using the temporary jar instead");
        return scriptUDFJarFile;
    }

    /**
     * Writes every script file of the context into a jar on the given stream and closes the
     * stream when done. Names of the entries written are recorded in {@code contents}.
     *
     * @param os
     *            the stream the jar is written to; closed on normal completion
     * @param pigContext
     *            the context supplying the script files
     * @param contents
     *            receives the names of the entries added; also used to skip duplicates
     * @throws IOException
     *             if a script file cannot be found or written
     */
    private static void createPigScriptUDFJar(OutputStream os, PigContext pigContext,
            HashMap<String, String> contents) throws IOException {
        JarOutputStream jarOutputStream = new JarOutputStream(os);

        for (String path : pigContext.scriptFiles) {
            addScriptEntry(jarOutputStream, path, new File(path), path, contents);
        }
        for (Map.Entry<String, File> entry : pigContext.getScriptFiles().entrySet()) {
            File scriptFile = entry.getValue();
            addScriptEntry(jarOutputStream, entry.getKey(), scriptFile, scriptFile.getPath(),
                    contents);
        }

        if (!contents.isEmpty()) {
            jarOutputStream.close();
        } else {
            // NOTE(review): the raw stream is closed instead of the jar stream when nothing
            // was added -- presumably because finishing a jar with no entries fails on some
            // JDKs; confirm before simplifying.
            os.close();
        }
    }

    /**
     * Adds one script to the jar, reading it from the local file when that exists and from
     * the Pig class loader otherwise.
     *
     * @param jarOutputStream
     *            the jar being written
     * @param entryName
     *            the name of the jar entry
     * @param file
     *            the local file to try first; its last-modified time stamps the entry
     * @param resourceName
     *            the class-path resource name to fall back on
     * @param contents
     *            the names already present in the jar
     * @throws IOException
     *             if the script is found neither as a file nor as a resource, or on I/O error
     */
    private static void addScriptEntry(JarOutputStream jarOutputStream, String entryName,
            File file, String resourceName, Map<String, String> contents) throws IOException {
        log.debug("Adding entry " + entryName + " to job jar" );
        InputStream stream;
        if (file.exists()) {
            stream = new FileInputStream(file);
        } else {
            stream = PigContext.getClassLoader().getResourceAsStream(resourceName);
        }
        if (stream == null) {
            throw new IOException("Cannot find " + resourceName);
        }
        try {
            addStream(jarOutputStream, entryName, stream, contents, file.lastModified());
        } finally {
            stream.close();
        }
    }

    /**
     * Creates a Classloader based on the passed jarFile and any extra jar files.
     *
     * @param jarFile
     *            the jar file to be part of the newly created Classloader. This jar file plus any
     *            jars in the extraJars list will constitute the classpath. May be {@code null},
     *            in which case only the extra jars are used.
     * @param pigContext
     *            the context whose {@code extraJars} are appended to the classpath
     * @return the new Classloader.
     * @throws MalformedURLException
     *             if a jar location cannot be turned into a {@code file:} URL
     */
    static ClassLoader createCl(String jarFile, PigContext pigContext)
            throws MalformedURLException {
        int len = pigContext.extraJars.size();
        int passedJar = jarFile == null ? 0 : 1;
        URL[] urls = new URL[len + passedJar];
        if (jarFile != null) {
            urls[0] = new URL("file:" + jarFile);
        }
        for (int i = 0; i < len; i++) {
            urls[i + passedJar] = new URL("file:" + pigContext.extraJars.get(i));
        }
        return new URLClassLoader(urls, PigMapReduce.class.getClassLoader());
    }

    /**
     * Adds a stream to a Jar file.
     *
     * @param os
     *            the OutputStream of the Jar file to which the stream will be added.
     * @param name
     *            the name of the stream.
     * @param is
     *            the stream to add. It is read to the end but not closed.
     * @param contents
     *            the current contents of the Jar file. (We use this to avoid adding two streams
     *            with the same name.)
     * @param timestamp
     *            timestamp of the entry
     * @throws IOException
     *             if reading the stream or writing the entry fails
     */
    private static void addStream(JarOutputStream os, String name, InputStream is,
            Map<String, String> contents, long timestamp) throws IOException {
        if (contents.get(name) != null) {
            // an entry with this name was already written; first one wins
            return;
        }
        contents.put(name, "");

        JarEntry entry = new JarEntry(name);
        entry.setTime(timestamp);
        os.putNextEntry(entry);

        byte[] buffer = new byte[4096];
        int rc;
        while ((rc = is.read(buffer)) > 0) {
            os.write(buffer, 0, rc);
        }
    }

    /**
     * Resolves the jars containing the representative classes listed in
     * {@link DefaultPigPackages}.
     *
     * @return the distinct jar paths found, in enum declaration order; classes that are not
     *         loaded from a jar contribute nothing
     */
    public static List<String> getDefaultJars() {
        List<String> defaultJars = new ArrayList<String>();
        for (DefaultPigPackages pkgToSend : DefaultPigPackages.values()) {
            String jar = findContainingJar(pkgToSend.getPkgClass());
            if (jar != null && !defaultJars.contains(jar)) {
                defaultJars.add(jar);
            }
        }
        return defaultJars;
    }

    /**
     * Find a jar that contains a class of the same name, if any. It will return a jar file, even if
     * that is not the first thing on the class path that has a class with the same name.
     * An {@link IOException} raised while searching is rethrown wrapped in a
     * {@link RuntimeException}.
     *
     * @param my_class
     *            the class to find
     * @return a jar file that contains the class, or null
     */
    public static String findContainingJar(Class<?> my_class) {
        ClassLoader loader = PigContext.getClassLoader();
        String class_file = my_class.getName().replace('.', '/') + ".class";
        try {
            Enumeration<URL> itr = null;
            //Try to find the class in registered jars
            if (loader instanceof URLClassLoader) {
                itr = ((URLClassLoader) loader).findResources(class_file);
            }
            //Try system classloader if not URLClassLoader or no resources found in URLClassLoader
            if (itr == null || !itr.hasMoreElements()) {
                itr = loader.getResources(class_file);
            }
            while (itr.hasMoreElements()) {
                URL url = itr.nextElement();
                if (!"jar".equals(url.getProtocol())) {
                    continue;
                }
                String toReturn = url.getPath();
                if (toReturn.startsWith("file:")) {
                    toReturn = toReturn.substring("file:".length());
                }
                // URLDecoder is a misnamed class, since it actually decodes
                // x-www-form-urlencoded MIME type rather than actual
                // URL encoding (which the file path has). Therefore it would
                // decode +s to ' 's which is incorrect (spaces are actually
                // either unencoded or encoded as "%20"). Replace +s first, so
                // that they are kept sacred during the decoding process.
                toReturn = toReturn.replaceAll("\\+", "%2B");
                toReturn = URLDecoder.decode(toReturn, "UTF-8");
                // drop the "!/path/inside/the.jar" suffix, leaving only the jar's own path
                return toReturn.replaceAll("!.*$", "");
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return null;
    }

    /**
     * Add the jars containing the given classes to the job's configuration
     * such that JobClient will ship them to the cluster and add them to
     * the DistributedCache
     *
     * @param job
     *            Job object
     * @param classes
     *            classes to find
     * @throws IOException
     *             if the local file system cannot be obtained
     */
    public static void addDependencyJars(Job job, Class<?>... classes) throws IOException {
        Configuration conf = job.getConfiguration();
        FileSystem fs = FileSystem.getLocal(conf);
        // start from whatever is already registered so existing entries are kept
        Set<String> jars = new HashSet<String>();
        jars.addAll(conf.getStringCollection("tmpjars"));
        addQualifiedJarsName(fs, jars, classes);
        if (jars.isEmpty()) {
            return;
        }
        conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[0])));
    }

    /**
     * Add the qualified path name of jars containing the given classes
     *
     * @param fs
     *            FileSystem object
     * @param jars
     *            the resolved path names to be added to this set
     * @param classes
     *            classes to find; a class for which no jar is found is logged and skipped
     */
    private static void addQualifiedJarsName(FileSystem fs, Set<String> jars,
            Class<?>... classes) {
        URI fsUri = fs.getUri();
        Path workingDir = fs.getWorkingDirectory();
        for (Class<?> clazz : classes) {
            String jarName = findContainingJar(clazz);
            if (jarName == null) {
                log.warn("Could not find jar for class " + clazz);
                continue;
            }
            jars.add(new Path(jarName).makeQualified(fsUri, workingDir).toString());
        }
    }
}