/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.util;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.SequenceInputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.FileInputLoadFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.LoadFunc;
import org.apache.pig.PigConfiguration;
import org.apache.pig.PigException;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.PigImplConstants;
import org.apache.pig.impl.io.InterStorage;
import org.apache.pig.impl.io.ReadToEndLoader;
import org.apache.pig.impl.io.SequenceFileInterStorage;
import org.apache.pig.impl.io.TFileStorage;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.newplan.logical.relational.LogicalSchema;
import org.apache.pig.parser.ParserException;
import org.apache.pig.parser.QueryParserDriver;
import org.joda.time.DateTimeZone;

import com.google.common.collect.Lists;
import com.google.common.primitives.Longs;

/**
 * Class with utility static methods
 */
public class Utils {

    private static final Log log = LogFactory.getLog(Utils.class);

    private static final Pattern JAVA_MAXHEAPSIZE_PATTERN = Pattern.compile("-Xmx(([0-9]+)[mMgG])");

    /**
     * This method checks whether JVM vendor is IBM
     * @return true if IBM JVM is being used
     * false otherwise
     */
    public static boolean isVendorIBM() {
        return System.getProperty("java.vendor").contains("IBM");
    }

    public static boolean isHadoop23() {
        String version =
                org.apache.hadoop.util.VersionInfo.getVersion();
        if (version.matches("\\b0\\.23\\..+\\b"))
            return true;
        return false;
    }

    public static boolean isHadoop2() {
        String version = org.apache.hadoop.util.VersionInfo.getVersion();
        if (version.matches("\\b2\\.\\d+\\..+"))
            return true;
        return false;
    }
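
    // For illustration: the two version checks above match release strings such
    // as "0.23.9" (isHadoop23) and "2.7.3" (isHadoop2) respectively.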

    /**
     * This method is a helper for classes to implement {@link java.lang.Object#equals(java.lang.Object)}.
     * It checks whether two objects are equal in two steps: first, whether both
     * are null or both are non-null. If either is null, the result is whether
     * both are null. If both are non-null, object equality is also checked when
     * so indicated.
     * @param obj1 first object to be compared
     * @param obj2 second object to be compared
     * @param checkEquality flag to indicate whether object equality should
     * be checked if obj1 and obj2 are non-null
     * @return true if the two objects are equal
     * false otherwise
     */
    public static boolean checkNullEquals(Object obj1, Object obj2, boolean checkEquality) {
        if(obj1 == null || obj2 == null) {
            return obj1 == obj2;
        }
        if(checkEquality) {
            if(!obj1.equals(obj2)) {
                return false;
            }
        }
        return true;
    }

    /**
     * This method is a helper for classes to implement {@link java.lang.Object#equals(java.lang.Object)}.
     * The method checks whether the two arguments are both null or both not null and
     * whether they are of the same class.
     * @param obj1 first object to compare
     * @param obj2 second object to compare
     * @return true if both objects are null, or both are not null
     * and of the same class;
     * false otherwise
     */
    public static boolean checkNullAndClass(Object obj1, Object obj2) {
        if(checkNullEquals(obj1, obj2, false)) {
            if(obj1 != null) {
                return obj1.getClass() == obj2.getClass();
            } else {
                return true; // both obj1 and obj2 should be null
            }
        } else {
            return false;
        }
    }
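
    /*
     * For illustration, the two null-check helpers above behave as follows:
     *
     *   checkNullEquals(null, null, true)           -> true   (both null)
     *   checkNullEquals("a", null, true)            -> false  (exactly one is null)
     *   checkNullEquals("a", "b", false)            -> true   (equality not checked)
     *   checkNullAndClass("a", "b")                 -> true   (both non-null, same class)
     *   checkNullAndClass("a", Integer.valueOf(1))  -> false  (different classes)
     */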

    /**
     * A helper function for retrieving the script schema set by the LOLoad
     * function.
     *
     * @param loadFuncSignature
     * @param conf
     * @return Schema
     * @throws IOException
     */
    public static Schema getScriptSchema(
            String loadFuncSignature,
            Configuration conf) throws IOException {
        Schema scriptSchema = null;
        String scriptField = conf.get(getScriptSchemaKey(loadFuncSignature));

        if (scriptField != null) {
            scriptSchema = (Schema) ObjectSerializer.deserialize(scriptField);
        }

        return scriptSchema;
    }

    public static String getScriptSchemaKey(String loadFuncSignature) {
        return loadFuncSignature + ".scriptSchema";
    }

    public static ResourceSchema getSchema(LoadFunc wrappedLoadFunc, String location, boolean checkExistence,
            Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        if (checkExistence) {
            Path path = new Path(location);
            if (!FileSystem.get(conf).exists(path)) {
                // At compile time in batch mode, the file may not exist
                // (such as intermediate file). Just return null - the
                // same way as we would if we did not get a valid record
                return null;
            }
        }
        ReadToEndLoader loader = new ReadToEndLoader(wrappedLoadFunc, conf, location, 0);
        // get the first record from the input file
        // and figure out the schema from the data in
        // the first record
        Tuple t = loader.getNext();
        if (t == null) {
            // we couldn't get a valid record from the input
            return null;
        }
        int numFields = t.size();
        Schema s = new Schema();
        for (int i = 0; i < numFields; i++) {
            try {
                s.add(DataType.determineFieldSchema(t.get(i)));
            } catch (Exception e) {
                int errCode = 2104;
                String msg = "Error while determining schema of SequenceFileStorage data.";
                throw new ExecException(msg, errCode, PigException.BUG, e);
            }
        }
        return new ResourceSchema(s);
    }

    /**
     * @param schemaString a String representation of the Schema <b>without</b>
     *            any enclosing curly-braces. <b>Not</b> for use with
     *            <code>Schema#toString</code>
     * @return Schema instance
     * @throws ParserException
     */
    public static Schema getSchemaFromString(String schemaString) throws ParserException {
        LogicalSchema schema = parseSchema(schemaString);
        Schema result = org.apache.pig.newplan.logical.Util.translateSchema(schema);
        Schema.setSchemaDefaultType(result, DataType.BYTEARRAY);
        return result;
    }

    /**
     * getSchemaFromBagSchemaString
     * <b>NOTE: use this call</b> when you need to generate a Schema object
     * from the representation generated by <code>Schema#toString</code>.
     * This call strips the enclosing outer curly braces from the <code>toString</code>
     * representation, which are placed there because the actual representation of
     * the schema data is as a Bag-type relation.
     * @param schemaString a String representation of the Schema to instantiate,
     *            in the form generated by <code>Schema.toString()</code>
     * @return Schema instance
     * @throws ParserException
     */
    public static Schema getSchemaFromBagSchemaString(String schemaString) throws ParserException {
        String unwrappedSchemaString = schemaString.substring(1, schemaString.length() - 1);
        return getSchemaFromString(unwrappedSchemaString);
    }

    public static LogicalSchema parseSchema(String schemaString) throws ParserException {
        QueryParserDriver queryParser = new QueryParserDriver(new PigContext(),
                "util", new HashMap<String, String>());
        LogicalSchema schema = queryParser.parseSchema(schemaString);
        return schema;
    }
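
    /*
     * For illustration, the two schema-string helpers above differ only in
     * whether the enclosing curly braces produced by Schema#toString() are
     * stripped before parsing:
     *
     *   Schema a = Utils.getSchemaFromString("name: chararray, age: int");
     *   Schema b = Utils.getSchemaFromBagSchemaString("{name: chararray, age: int}");
     *   // a and b describe the same two fields
     */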

    /**
     * This method adds FieldSchema of 'input source tag/path' as the first
     * field. This will be called only when PigStorage is invoked with
     * '-tagFile' or '-tagPath' option and the schema file is present to be
     * loaded.
     *
     * @param schema
     * @param fieldName
     * @return ResourceSchema
     */
    public static ResourceSchema getSchemaWithInputSourceTag(ResourceSchema schema, String fieldName) {
        ResourceFieldSchema[] fieldSchemas = schema.getFields();
        ResourceFieldSchema sourceTagSchema = new ResourceFieldSchema(new FieldSchema(fieldName, DataType.CHARARRAY));
        ResourceFieldSchema[] fieldSchemasWithSourceTag = new ResourceFieldSchema[fieldSchemas.length + 1];
        fieldSchemasWithSourceTag[0] = sourceTagSchema;
        for(int j = 0; j < fieldSchemas.length; j++) {
            fieldSchemasWithSourceTag[j + 1] = fieldSchemas[j];
        }
        return schema.setFields(fieldSchemasWithSourceTag);
    }

    private static enum TEMPFILE_CODEC {
        GZ (GzipCodec.class.getName()),
        GZIP (GzipCodec.class.getName()),
        LZO ("com.hadoop.compression.lzo.LzoCodec"),
        SNAPPY ("org.xerial.snappy.SnappyCodec"),
        BZIP2 (BZip2Codec.class.getName());

        private String hadoopCodecClassName;

        TEMPFILE_CODEC(String codecClassName) {
            this.hadoopCodecClassName = codecClassName;
        }

        public String lowerName() {
            return this.name().toLowerCase();
        }

        public String getHadoopCodecClassName() {
            return this.hadoopCodecClassName;
        }
    }

    private static enum TEMPFILE_STORAGE {
        INTER(InterStorage.class, null),
        TFILE(TFileStorage.class, Arrays.asList(
                TEMPFILE_CODEC.GZ, TEMPFILE_CODEC.GZIP, TEMPFILE_CODEC.LZO)),
        SEQFILE(SequenceFileInterStorage.class, Arrays.asList(
                TEMPFILE_CODEC.GZ, TEMPFILE_CODEC.GZIP, TEMPFILE_CODEC.LZO,
                TEMPFILE_CODEC.SNAPPY, TEMPFILE_CODEC.BZIP2));

        private Class<? extends FileInputLoadFunc> storageClass;
        private List<TEMPFILE_CODEC> supportedCodecs;

        TEMPFILE_STORAGE(
                Class<? extends FileInputLoadFunc> storageClass,
                List<TEMPFILE_CODEC> supportedCodecs) {
            this.storageClass = storageClass;
            this.supportedCodecs = supportedCodecs;
        }

        public String lowerName() {
            return this.name().toLowerCase();
        }

        public Class<? extends FileInputLoadFunc> getStorageClass() {
            return storageClass;
        }

        public boolean ensureCodecSupported(String codec) {
            try {
                return this.supportedCodecs.contains(TEMPFILE_CODEC.valueOf(codec.toUpperCase()));
            } catch (IllegalArgumentException e) {
                return false;
            }
        }

        public String supportedCodecsToString() {
            StringBuffer sb = new StringBuffer();
            boolean first = true;
            for (TEMPFILE_CODEC codec : supportedCodecs) {
                if(first) {
                    first = false;
                } else {
                    sb.append(",");
                }
                sb.append(codec.name());
            }
            return sb.toString();
        }
    }

    public static String getTmpFileCompressorName(PigContext pigContext) {
        if (pigContext == null)
            return InterStorage.class.getName();
        String codec = pigContext.getProperties().getProperty(PigConfiguration.PIG_TEMP_FILE_COMPRESSION_CODEC, "");
        if (codec.equals(TEMPFILE_CODEC.LZO.lowerName())) {
            pigContext.getProperties().setProperty("io.compression.codec.lzo.class", "com.hadoop.compression.lzo.LzoCodec");
        }
        return getTmpFileStorage(pigContext.getProperties()).getStorageClass().getName();
    }

    public static FileInputLoadFunc getTmpFileStorageObject(Configuration conf) throws IOException {
        Class<? extends FileInputLoadFunc> storageClass = getTmpFileStorageClass(ConfigurationUtil.toProperties(conf));
        try {
            return storageClass.newInstance();
        } catch (InstantiationException e) {
            throw new IOException(e);
        } catch (IllegalAccessException e) {
            throw new IOException(e);
        }
    }
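
    /*
     * Illustrative configuration sketch (using the PigConfiguration constants
     * referenced in this section): with compression disabled (the default) the
     * helpers above resolve to InterStorage; the properties below would select
     * compressed SequenceFile temp storage with the snappy codec instead.
     *
     *   Properties props = pigContext.getProperties();
     *   props.setProperty(PigConfiguration.PIG_ENABLE_TEMP_FILE_COMPRESSION, "true");
     *   props.setProperty(PigConfiguration.PIG_TEMP_FILE_COMPRESSION_STORAGE, "seqfile");
     *   props.setProperty(PigConfiguration.PIG_TEMP_FILE_COMPRESSION_CODEC, "snappy");
     *   // getTmpFileStorageClass(props) now returns SequenceFileInterStorage.class
     */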

    public static Class<? extends FileInputLoadFunc> getTmpFileStorageClass(Properties properties) {
        return getTmpFileStorage(properties).getStorageClass();
    }

    private static TEMPFILE_STORAGE getTmpFileStorage(Properties properties) {
        boolean tmpFileCompression = properties.getProperty(
                PigConfiguration.PIG_ENABLE_TEMP_FILE_COMPRESSION, "false").equals("true");
        String tmpFileCompressionStorage =
                properties.getProperty(PigConfiguration.PIG_TEMP_FILE_COMPRESSION_STORAGE,
                        TEMPFILE_STORAGE.TFILE.lowerName());

        if (!tmpFileCompression) {
            return TEMPFILE_STORAGE.INTER;
        } else if (TEMPFILE_STORAGE.SEQFILE.lowerName().equals(tmpFileCompressionStorage)) {
            return TEMPFILE_STORAGE.SEQFILE;
        } else if (TEMPFILE_STORAGE.TFILE.lowerName().equals(tmpFileCompressionStorage)) {
            return TEMPFILE_STORAGE.TFILE;
        } else {
            throw new IllegalArgumentException("Unsupported storage format " + tmpFileCompressionStorage +
                    ". Should be one of " + Arrays.toString(TEMPFILE_STORAGE.values()));
        }
    }

    public static void setMapredCompressionCodecProps(Configuration conf) {
        String codec = conf.get(
                PigConfiguration.PIG_TEMP_FILE_COMPRESSION_CODEC, "");
        if ("".equals(codec) && conf.get(MRConfiguration.OUTPUT_COMPRESSION_CODEC) != null) {
            conf.setBoolean(MRConfiguration.OUTPUT_COMPRESS, true);
        } else if (TEMPFILE_STORAGE.SEQFILE.ensureCodecSupported(codec)) {
            conf.setBoolean(MRConfiguration.OUTPUT_COMPRESS, true);
            conf.set(MRConfiguration.OUTPUT_COMPRESSION_CODEC,
                    TEMPFILE_CODEC.valueOf(codec.toUpperCase()).getHadoopCodecClassName());
        }
        // no codec specified
    }

    public static void setTmpFileCompressionOnConf(PigContext pigContext, Configuration conf) throws IOException {
        // PIG-3741 This is also called for non-intermediate jobs, do not set any mapred properties here
        if (pigContext == null) {
            return;
        }
        TEMPFILE_STORAGE storage = getTmpFileStorage(pigContext.getProperties());
        String codec = pigContext.getProperties().getProperty(
                PigConfiguration.PIG_TEMP_FILE_COMPRESSION_CODEC, "");
        switch (storage) {
        case INTER:
            break;
        case SEQFILE:
            conf.set(PigConfiguration.PIG_TEMP_FILE_COMPRESSION_STORAGE, "seqfile");
            if ("".equals(codec)) {
                // codec is not specified, ensure is set
                log.warn("Temporary file compression codec is not specified. Using "
                        + MRConfiguration.OUTPUT_COMPRESSION_CODEC + " property.");
                if(conf.get(MRConfiguration.OUTPUT_COMPRESSION_CODEC) == null) {
                    throw new IOException(MRConfiguration.OUTPUT_COMPRESSION_CODEC + " is not set");
                }
            } else if(storage.ensureCodecSupported(codec)) {
                // do nothing
            } else {
                throw new IOException("Invalid temporary file compression codec [" + codec + "]. " +
                        "Expected compression codecs for " + storage.getStorageClass().getName() +
                        " are " + storage.supportedCodecsToString() + ".");
            }
            break;
        case TFILE:
            if(storage.ensureCodecSupported(codec)) {
                conf.set(PigConfiguration.PIG_TEMP_FILE_COMPRESSION_CODEC, codec.toLowerCase());
            } else {
                throw new IOException("Invalid temporary file compression codec [" + codec + "]. " +
                        "Expected compression codecs for " + storage.getStorageClass().getName() +
                        " are " + storage.supportedCodecsToString() + ".");
            }
            break;
        }
    }

    public static String getStringFromArray(String[] arr) {
        StringBuilder str = new StringBuilder();
        for(String s: arr) {
            str.append(s);
            str.append(" ");
        }
        return str.toString();
    }

    public static FuncSpec buildSimpleFuncSpec(String className, byte... types) {
        List<Schema.FieldSchema> fieldSchemas = Lists.newArrayListWithExpectedSize(types.length);
        for (byte type : types) {
            fieldSchemas.add(new Schema.FieldSchema(null, type));
        }
        return new FuncSpec(className, new Schema(fieldSchemas));
    }

    /**
     * Replaces sequences of two backslashes ("\\") with one backslash ("\")
     * (not escaping a backslash in grunt is disallowed, but a double backslash
     * doesn't get converted into a single backslash, so we have to do it instead)
     * @param str
     * @return the resulting string
     */
    public static String slashisize(String str) {
        return str.replace("\\\\", "\\");
    }

    @SuppressWarnings("unchecked")
    public static <O> Collection<O> mergeCollection(Collection<O> a, Collection<O> b) {
        if (a==null && b==null)
            return null;
        Collection<O> result = null;
        try {
            if (a!=null)
                result = a.getClass().newInstance();
            else
                result = b.getClass().newInstance();
        } catch (Exception e) {
            // Shall not happen
        }
        if (a==null) {
            result.addAll(b);
        } else if (b==null) {
            result.addAll(a);
        } else {
            result.addAll(a);
            for (O o : b) {
                if (!result.contains(o)) {
                    result.add(o);
                }
            }
        }
        return result;
    }

    public static InputStream getCompositeStream(InputStream in, Properties properties) {
        // Load default ~/.pigbootup if not specified by user
        final String bootupFile = properties.getProperty("pig.load.default.statements",
                System.getProperty("user.home") + "/.pigbootup");
        try {
            final InputStream inputStream = new FileInputStream(new File(bootupFile));
            return new SequenceInputStream(inputStream, in);
        } catch(FileNotFoundException fe) {
            log.info("Default bootup file " + bootupFile + " not found");
            return in;
        }
    }

    /**
     * Method to apply pig properties to JobConf (replaces properties with
     * resulting jobConf values).
     *
     * @param jobConf JobConf with appropriate hadoop resource files
     * @param properties Pig properties that will override hadoop properties;
     *            properties might be modified
     */
    public static void recomputeProperties(JobConf jobConf, Properties properties) {
        // We need to load the properties from the hadoop configuration
        // We want to override these with any existing properties we have.
        if (jobConf != null && properties != null) {
            // set user properties on the jobConf to ensure that defaults
            // and deprecation is applied correctly
            Enumeration<Object> propertiesIter = properties.keys();
            while (propertiesIter.hasMoreElements()) {
                String key = (String) propertiesIter.nextElement();
                String val = properties.getProperty(key);
                // We do not put user.name, See PIG-1419
                if (!key.equals("user.name")) {
                    jobConf.set(key, val);
                }
            }
            // clear user defined properties and re-populate
            properties.clear();
            Iterator<Map.Entry<String, String>> iter = jobConf.iterator();
            while (iter.hasNext()) {
                Map.Entry<String, String> entry = iter.next();
                properties.put(entry.getKey(), entry.getValue());
            }
        }
    }

    public static String getStackStraceStr(Throwable e) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PrintStream ps = new PrintStream(baos);
        e.printStackTrace(ps);
        return baos.toString();
    }

    public static boolean isLocal(PigContext pigContext, Configuration conf) {
        return pigContext.getExecType().isLocal() ||
                conf.getBoolean(PigImplConstants.CONVERTED_TO_LOCAL, false);
    }

    // PIG-3929 use parameter substitution for pig properties similar to Hadoop Configuration
    // Following code has been borrowed from Hadoop's Configuration#substituteVars
    private static Pattern varPat = Pattern.compile("\\$\\{[^\\}\\$\u0020]+\\}");
    private static int MAX_SUBST = 20;

    public static String substituteVars(String expr) {
        if (expr == null) {
            return null;
        }
        Matcher match = varPat.matcher("");
        String eval = expr;
        for(int s=0; s<MAX_SUBST; s++) {
            match.reset(eval);
            if (!match.find()) {
                return eval;
            }
            String var = match.group();
            var = var.substring(2, var.length()-1); // remove ${ .. }
            String val = null;
            val = System.getProperty(var);
            if (val == null) {
                return eval; // return literal ${var}: var is unbound
            }
            // substitute
            eval = eval.substring(0, match.start()) + val + eval.substring(match.end());
        }
        throw new IllegalStateException("Variable substitution depth too large: "
                + MAX_SUBST + " " + expr);
    }
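
    /*
     * For illustration: substituteVars() resolves ${...} references against
     * Java system properties, making at most MAX_SUBST passes; an unresolvable
     * variable leaves the expression as-is.
     *
     *   Utils.substituteVars("${user.home}/pig");    // e.g. "/home/someuser/pig"
     *   Utils.substituteVars("${no.such.prop}/pig"); // "${no.such.prop}/pig" (unchanged)
     */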

    /**
     * A PathFilter that filters out invisible files.
     */
    public static final PathFilter VISIBLE_FILES = new PathFilter() {
        @Override
        public boolean accept(final Path p) {
            return (!(p.getName().startsWith("_") || p.getName().startsWith(".")));
        }
    };

    /**
     * Finds a valid path for a file from a FileStatus object.
     * @param fileStatus FileStatus object corresponding to a file,
     * or a directory.
     * @param fileSystem FileSystem in which the file should be found
     * @return The first file found
     * @throws IOException
     */
    public static Path depthFirstSearchForFile(final FileStatus fileStatus,
            final FileSystem fileSystem) throws IOException {
        if (fileSystem.isFile(fileStatus.getPath())) {
            return fileStatus.getPath();
        } else {
            return depthFirstSearchForFile(
                    fileSystem.listStatus(fileStatus.getPath(), VISIBLE_FILES),
                    fileSystem);
        }
    }

    /**
     * Finds a valid path for a file from an array of FileStatus objects.
     * @param statusArray Array of FileStatus objects in which to search
     * for the file.
     * @param fileSystem FileSystem in which to search for the first file.
     * @return The first file found.
     * @throws IOException
     */
    public static Path depthFirstSearchForFile(final FileStatus[] statusArray,
            final FileSystem fileSystem) throws IOException {

        // Most recent files first
        Arrays.sort(statusArray, new Comparator<FileStatus>() {
            @Override
            public int compare(final FileStatus fs1, final FileStatus fs2) {
                return Longs.compare(fs2.getModificationTime(), fs1.getModificationTime());
            }
        });

        for (FileStatus f : statusArray) {
            Path p = depthFirstSearchForFile(f, fileSystem);
            if (p != null) {
                return p;
            }
        }

        return null;
    }

    public static int extractHeapSizeInMB(String input) {
        int ret = 0;
        if(input == null || input.equals(""))
            return ret;
        Matcher m = JAVA_MAXHEAPSIZE_PATTERN.matcher(input);
        String heapStr = null;
        String heapNum = null;
        // Grabs the last match, which takes effect (in case multiple -Xmx options are specified)
        while (m.find()) {
            heapStr = m.group(1);
            heapNum = m.group(2);
        }
        if (heapStr != null) {
            // when Xmx specified in Gigabyte
            if(heapStr.endsWith("g") || heapStr.endsWith("G")) {
                ret = Integer.parseInt(heapNum) * 1024;
            } else {
                ret = Integer.parseInt(heapNum);
            }
        }
        return ret;
    }

    public static void setDefaultTimeZone(Configuration conf) {
        String dtzStr = conf.get(PigConfiguration.PIG_DATETIME_DEFAULT_TIMEZONE);
        if (dtzStr != null && dtzStr.length() > 0) {
            // don't use offsets because it breaks across DST/Standard Time
            DateTimeZone.setDefault(DateTimeZone.forID(dtzStr));
        }
    }
}