TraceBuilder.java example

Explorer
bigpetstore-master
- hadoop-1.2.1
  - src
- src
  - integration
    - java
      - org
        bigtop
        bigpetstore
        integration
        BigPetStoreHiveIT.java
        BigPetStoreMahoutIT.java
        BigPetStorePigIT.java
        ITUtils.java
  - main
    - java
      - org
        bigtop
        bigpetstore
        clustering
        BPSRecommnder.java
        MahoutClusterTransactionsByRegion.java
        contract
        PetStoreStatistics.java
        etl
        CrunchETL.java
        HiveViewCreator.java
        LineItem.java
        PigCSVCleaner.java
        generator
        BPSGenerator.java
        GeneratePetStoreTransactionsInputFormat.java
        PetStoreTransaction.java
        PetStoreTransactionInputSplit.java
        TransactionIteratorFactory.java
        util
        BigPetStoreConstants.java
        DeveloperTools.java
        NumericalIdUtils.java
        Pair.java
        PetStoreParseFunctions.java
        StringUtils.java
  - test
    - java
      - org
        bigtop
        bigpetstore
        docs
        TestDocs.java
        generator
        TestNumericalIdUtils.java
        TestPetStoreTransactionGeneratorJob.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.tools.rumen;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapred.JobHistory;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * The main driver of the Rumen Parser.
 */
public class TraceBuilder extends Configured implements Tool {
  static final private Log LOG = LogFactory.getLog(TraceBuilder.class);

  static final int RUN_METHOD_FAILED_EXIT_CODE = 3;

  TopologyBuilder topologyBuilder = new TopologyBuilder();
  Outputter<LoggedJob> traceWriter;
  Outputter<LoggedNetworkTopology> topologyWriter;

  static class MyOptions {
    Class<? extends InputDemuxer> inputDemuxerClass = DefaultInputDemuxer.class;

    @SuppressWarnings("unchecked")
    Class<? extends Outputter> clazzTraceOutputter = DefaultOutputter.class;
    Path traceOutput;
    Path topologyOutput;

    List<Path> inputs = new LinkedList<Path>();

    MyOptions(String[] args, Configuration conf) throws FileNotFoundException,
        IOException, ClassNotFoundException {
      int switchTop = 0;

      // to determine if the input paths should be recursively scanned or not
      boolean doRecursiveTraversal = false;

      while (args[switchTop].startsWith("-")) {
        if (args[switchTop].equalsIgnoreCase("-demuxer")) {
          inputDemuxerClass =
            Class.forName(args[++switchTop]).asSubclass(InputDemuxer.class);
        } else if (args[switchTop].equalsIgnoreCase("-recursive")) {
          doRecursiveTraversal = true;
        }
        ++switchTop;
      }

      traceOutput = new Path(args[0 + switchTop]);
      topologyOutput = new Path(args[1 + switchTop]);

      for (int i = 2 + switchTop; i < args.length; ++i) {
        inputs.addAll(processInputArgument(
            args[i], conf, doRecursiveTraversal));
      }
    }

    /**
     * Compare the history file names, not the full paths.
     * Job history file name format is such that doing lexicographic sort on the
     * history file names should result in the order of jobs' submission times.
     */
    private static class HistoryLogsComparator
    implements Comparator<FileStatus> {
      @Override
      public int compare(FileStatus file1, FileStatus file2) {
        return file1.getPath().getName().compareTo(
            file2.getPath().getName());
      }
    }

    private static class InputFilter implements PathFilter {
      public boolean accept(Path path) {
        return !(path.getName().endsWith(".crc")
                 || path.getName().startsWith("."));
      }
    }

    /**
     * List files (possibly recursively) and get their statuses.
     * @param path The path of the file/dir for which ls is to be done
     * @param fs FileSystem of the path
     * @param filter the user-supplied path filter
     * @return the list of file statuses under the given path
     */
    static List<FileStatus> listFiles(Path path, FileSystem fs,
        PathFilter filter, boolean isRecursive) throws IOException {
      List<FileStatus> list = new ArrayList<FileStatus>();
      FileStatus[] statuses = fs.listStatus(path, filter);
      if (statuses != null) {
        for (FileStatus status : statuses) {
          if (status.isDir()) {
            if (isRecursive) {
              list.addAll(listFiles(status.getPath(), fs, filter, isRecursive));
            }
          } else {
            list.add(status);
          }
        }
      }
      return list;
    }

    /**
     * Processes the input file/folder argument. If the input is a file,
     * then it is directly considered for further processing by TraceBuilder.
     * If the input is a folder, then all the history logs in the
     * input folder are considered for further processing.
     *
     * If isRecursive is true, then the input path is recursively scanned
     * for job history logs for further processing by TraceBuilder.
     *
     * NOTE: If the input represents a globbed path, then it is first flattened
     *       and then the individual paths represented by the globbed input
     *       path are considered for further processing.
     *
     * @param input        input path, possibly globbed
     * @param conf         configuration
     * @param isRecursive  whether to recursively traverse the input paths to
     *                     find history logs
     * @return the input history log files' paths
     * @throws FileNotFoundException
     * @throws IOException
     */
    static List<Path> processInputArgument(String input, Configuration conf,
        boolean isRecursive) throws FileNotFoundException, IOException {
      Path inPath = new Path(input);
      FileSystem fs = inPath.getFileSystem(conf);
      FileStatus[] inStatuses = fs.globStatus(inPath);

      List<Path> inputPaths = new LinkedList<Path>();
      if (inStatuses == null || inStatuses.length == 0) {
        return inputPaths;
      }

      for (FileStatus inStatus : inStatuses) {
        Path thisPath = inStatus.getPath();
        if (inStatus.isDir()) {

          // Find list of files in this path(recursively if -recursive option
              // is specified).
          List<FileStatus> historyLogs = new ArrayList<FileStatus>();

          List<FileStatus> statuses = listFiles(thisPath, fs, new InputFilter(),
              isRecursive);
          for (FileStatus child : statuses) {
            historyLogs.add(child);
          }
          if (historyLogs.size() > 0) {
            // Add the sorted history log file names in this path to the
            // inputPaths list
            FileStatus[] sortableNames =
              historyLogs.toArray(new FileStatus[historyLogs.size()]);
            Arrays.sort(sortableNames, new HistoryLogsComparator());

            for (FileStatus historyLog : sortableNames) {
              inputPaths.add(historyLog.getPath());
            }
          }
        } else {
          inputPaths.add(thisPath);
        }
      }

      return inputPaths;
    }
  }

  public static void main(String[] args) {
    TraceBuilder builder = new TraceBuilder();
    int result = RUN_METHOD_FAILED_EXIT_CODE;

    try {
      result = ToolRunner.run(builder, args); 
    } catch (Throwable t) {
      t.printStackTrace(System.err);
    } finally {
      try {
        builder.finish();
      } finally {
        if (result == 0) {
          return;
        }

        System.exit(result);
      }
    }
  }

  private static String applyParser(String fileName, Pattern pattern) {
    Matcher matcher = pattern.matcher(fileName);

    if (!matcher.matches()) {
      return null;
    }

    return matcher.group(1);
  }

  /**
   * @param fileName
   * @return the jobID String, parsed out of the file name. We return a valid
   *         String for either a history log file or a config file. Otherwise,
   *         [especially for .crc files] we return null.
   */
  static String extractJobID(String fileName) {
    String jobId = applyParser(fileName, JobHistory.JOBHISTORY_FILENAME_REGEX);
    if (jobId == null) {
      // check if its a pre21 jobhistory file
      jobId = applyParser(fileName, 
                          Pre21JobHistoryConstants.JOBHISTORY_FILENAME_REGEX);
    }
    return jobId;
  }

  static boolean isJobConfXml(String fileName, InputStream input) {
    String jobId = applyParser(fileName, JobHistory.CONF_FILENAME_REGEX);
    if (jobId == null) {
      // check if its a pre21 jobhistory conf file
      jobId = applyParser(fileName, 
                          Pre21JobHistoryConstants.CONF_FILENAME_REGEX);
    }
    return jobId != null;
  }


  @SuppressWarnings("unchecked")
  @Override
  public int run(String[] args) throws Exception {
    MyOptions options = new MyOptions(args, getConf());
    traceWriter = options.clazzTraceOutputter.newInstance();
    traceWriter.init(options.traceOutput, getConf());
    topologyWriter = new DefaultOutputter<LoggedNetworkTopology>();
    topologyWriter.init(options.topologyOutput, getConf());

    try {
      JobBuilder jobBuilder = null;

      for (Path p : options.inputs) {
        InputDemuxer inputDemuxer = options.inputDemuxerClass.newInstance();

        try {
          inputDemuxer.bindTo(p, getConf());
        } catch (IOException e) {
          LOG.warn("Unable to bind Path " + p + " .  Skipping...", e);

          continue;
        }

        Pair<String, InputStream> filePair = null;

        try {
          while ((filePair = inputDemuxer.getNext()) != null) {
            RewindableInputStream ris =
                new RewindableInputStream(filePair.second());

            JobHistoryParser parser = null;

            try {
              String jobID = extractJobID(filePair.first());
              if (jobID == null) {
                LOG.warn("File skipped: Invalid file name: "
                    + filePair.first());
                continue;
              }
              if ((jobBuilder == null)
                  || (!jobBuilder.getJobID().equals(jobID))) {
                if (jobBuilder != null) {
                  traceWriter.output(jobBuilder.build());
                }
                jobBuilder = new JobBuilder(jobID);
              }

              if (isJobConfXml(filePair.first(), ris)) {
            	processJobConf(JobConfigurationParser.parse(ris.rewind()), jobBuilder);
              } else {
                parser = JobHistoryParserFactory.getParser(ris);
                if (parser == null) {
                  LOG.warn("File skipped: Cannot find suitable parser: "
                      + filePair.first());
                } else {
                  processJobHistory(parser, jobBuilder);
                }
              }
            } finally {
              if (parser == null) {
                ris.close();
              } else {
                parser.close();
                parser = null;
              }
            }
          }
        } catch (Throwable t) {
          if (filePair != null) {
            LOG.warn("TraceBuilder got an error while processing the [possibly virtual] file "
                + filePair.first() + " within Path " + p , t);
          }
        } finally {
          inputDemuxer.close();
        }
      }
      if (jobBuilder != null) {
        traceWriter.output(jobBuilder.build());
        jobBuilder = null;
      } else {
        LOG.warn("No job found in traces: ");
      }

      topologyWriter.output(topologyBuilder.build());
    } finally {
      traceWriter.close();
      topologyWriter.close();
    }

    return 0;
  }

  private void processJobConf(Properties properties, JobBuilder jobBuilder) {
    jobBuilder.process(properties);
    topologyBuilder.process(properties);
  }

  void processJobHistory(JobHistoryParser parser, JobBuilder jobBuilder)
      throws IOException {
    HistoryEvent e;
    while ((e = parser.nextEvent()) != null) {
      jobBuilder.process(e);
      topologyBuilder.process(e);
    }

    parser.close();
  }

  void finish() {
    IOUtils.cleanup(LOG, traceWriter, topologyWriter);
  }
}