package org.apache.hadoop.hive.mastiff;

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.mastiff.MastiffHandlerUtil.MTableDesc;
import org.apache.hadoop.hive.mastiff.SegmentFile.SegmentIndexReader;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;

import cn.ac.ncic.mastiff.etl.ETLUtils;
import cn.ac.ncic.mastiff.io.segmentfile.PageMeta;
import cn.ac.ncic.mastiff.io.segmentfile.PageMeta.ScanMode;
import cn.ac.ncic.mastiff.mapred.MastiffMapReduce;

/**
 * SegmentFileInputFormat is adapted from
 * {@link cn.ac.ncic.mastiff.io.segmentfile.SegmentInputFormat}.<br/>
 * It uses the filter expression pushed down by Hive (filter pushdown) to
 * select the segments that may contain matching rows.
 *
 * @param <K>
 * @param <V>
 */
public class SegmentFileInputFormat<K, V> extends FileInputFormat<K, V> {

  public static final String FILTER_EXPR_CONF_STR = "hive.io.filter.expr.serialized";
  public static final Log LOG = LogFactory.getLog(SegmentFileInputFormat.class);

  private static HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();

  JobConf conf;
  ExprNodeDesc filter;
  MTableDesc tblDesc;

  @Override
  public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter)
      throws IOException {
    return new SegmentFileRecordReader(split, job, reporter);
  }

  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    conf = job;
    String filterExprSerialized = job.get(FILTER_EXPR_CONF_STR);
    String tblName = job.get(MastiffHandlerUtil.CF_TABLE_NAME);
    if (filterExprSerialized == null || filterExprSerialized.equals("")) {
      filter = null;
    } else {
      filter = Utilities.deserializeExpression(filterExprSerialized, job);
      try {
        MastiffHandlerUtil.setCFMeta(conf, tblName);
      } catch (Exception e) {
        LOG.error("Failed to set column-family meta for table " + tblName, e);
      }
      tblDesc = MastiffHandlerUtil.getMTableDesc(job);
      MastiffHandlerUtil.getCFTypes(tblDesc);
    }

    FileStatus[] files = listStatus(job);
    ArrayList<SegmentFileSplit> splits = new ArrayList<SegmentFileSplit>();
    boolean withPageMeta = MastiffMapReduce.getReadMastiffPageMeta(job);

    int n_seg = 0;
    int r_seg = 0;
    int p_seg = 0;
    for (FileStatus file : files) {
      Path path = file.getPath();
      FileSystem fs = path.getFileSystem(job);
      long fileLen = file.getLen();
      if (fileLen == 0) {
        continue;
      }
      // get the file id from its file name
      int fileId = -1;
      try {
        fileId = ETLUtils.getPartitionFromFileName(path.getName());
      } catch (ParseException pe) {
        throw new IOException(StringUtils.stringifyException(pe));
      }
      // form a segment id: segId = (fileId << 16) + idx_in_current_file
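      // The segment id packs the file id into the high bits and the segment's
      // position within the file into the low 16 bits, e.g. fileId 3 and
      // index 5 yield (3 << 16) + 5 = 0x30005; this assumes a file never holds
      // more than 65536 segments. The per-segment id is emitted below as
      // segId + curIdx.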
      int segId = fileId << 16;

      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, fileLen);
      SegmentIndexReader sir = new SegmentIndexReader(fs, path, withPageMeta);
      sir.readSegIndex();
      sir.close();
      long[] segmentOffsets = sir.getSegOffsets();
      long[] segmentPMSOffsets = sir.getSegPMSOffsets();
      long[] segmentLengths = sir.getSegLengths();

      ScanMode[] segmentScanModes = new ScanMode[segmentOffsets.length];
      if (filter != null) {
        // if (filter != null && MastiffHandlerUtil.isColInTable(tblName, filter)) {
        // get filter cols & cfs
        List<Integer> filterColumns = new ArrayList<Integer>();
        List<Integer> filterCfs = new ArrayList<Integer>();
        MastiffHandlerUtil.getFilterInfo(tblDesc, filter, filterCfs, filterColumns);
        PageMeta[][] segMetas = sir.getSegMetas();
        for (int sgIdx = 0; sgIdx < segmentOffsets.length; sgIdx++) {
          segmentScanModes[sgIdx] =
              MastiffHandlerUtil.getScanModeForSegment(tblDesc, filter, segMetas[sgIdx]);
        }
      } else {
        for (int i = 0; i < segmentOffsets.length; i++) {
          segmentScanModes[i] = ScanMode.Positive;
        }
      }

      for (int curIdx = 0; curIdx < segmentOffsets.length; curIdx++) {
        if (segmentScanModes[curIdx] == ScanMode.Negative) {
          n_seg++;
          continue;
        } else if (segmentScanModes[curIdx] == ScanMode.Rough) {
          r_seg++;
        } else if (segmentScanModes[curIdx] == ScanMode.Positive) {
          p_seg++;
        }
        int blkIndex = getBlockIndex(blkLocations, segmentOffsets[curIdx]);
        // LOG.info("Add a segment split : offset " + segmentOffsets[curIdx] + " length "
        //     + segmentLengths[curIdx] + " : pms offset " + segmentPMSOffsets[curIdx]);
        splits.add(new SegmentFileSplit(segId + curIdx, path, segmentOffsets[curIdx],
            segmentPMSOffsets[curIdx], segmentLengths[curIdx], segmentLengths[curIdx],
            blkLocations[blkIndex], tblName));
      }
    }

    LOG.info("Total # of Positive Segment : " + p_seg);
    LOG.info("Total # of Rough Segment : " + r_seg);
    LOG.info("Total # of Negative Segment : " + n_seg);
    LOG.info("Total # of SegmentFileSplit : " + splits.size());

    // return splits.toArray(new SegmentFileSplit[0]);
    List<SegmentFileCombineSplit> combineResults = new ArrayList<SegmentFileCombineSplit>();
    long maxSize = conf.getLong("mastiff.mapred.max.split.size", 268435456L); // 256 MB
    long minSizeNode = conf.getLong("mastiff.mapred.min.split.size.per.node", 1L);
    long minSizeRack = conf.getLong("mastiff.mapred.min.split.size.per.rack", 1L);
    getMoreSplits(maxSize, minSizeNode, minSizeRack, splits, combineResults);
    LOG.info("Total # of SegmentFileCombineSplit : " + combineResults.size());
    return combineResults.toArray(new SegmentFileCombineSplit[0]);
  }

  /**
   * Combine node- and rack-local SegmentFileSplits into larger SegmentFileCombineSplits.
   *
   * Most generated splits end up with a size of at least <code>maxSize</code>;
   * if no combining is wanted, simply set maxSize to 1.
   * Adapted from {@link org.apache.hadoop.mapred.lib.CombineFileInputFormat}.
   *
   * @throws IOException
   */
  protected void getMoreSplits(long maxSize, long minSizeNode, long minSizeRack,
      List<SegmentFileSplit> insplits, List<SegmentFileCombineSplit> outsplits)
      throws IOException {
    // mapping from rack name to the list of splits it has
    HashMap<String, List<SegmentFileSplit>> rackToSplits =
        new HashMap<String, List<SegmentFileSplit>>();
    // mapping from segment split to the nodes on which it has replicas
    HashMap<SegmentFileSplit, String[]> splitToNodes =
        new HashMap<SegmentFileSplit, String[]>();
    // mapping from node to the list of splits it contains
    HashMap<String, List<SegmentFileSplit>> nodeToSplits =
        new HashMap<String, List<SegmentFileSplit>>();
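    // Combining runs in three passes over the index built by initMapping():
    // 1) splits local to a single node are grouped until maxSize is reached,
    // 2) the leftovers are grouped per rack, and
    // 3) groups still smaller than minSizeRack are pooled into 'overflow' and
    //    combined at the end without any locality preference.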
    initMapping(insplits, rackToSplits, splitToNodes, nodeToSplits);

    List<SegmentFileSplit> validSegment = new ArrayList<SegmentFileSplit>();
    List<String> nodes = new ArrayList<String>();
    long curSplitSize = 0;

    // process all segment splits local to a node
    for (Entry<String, List<SegmentFileSplit>> curNodeToSplits : nodeToSplits.entrySet()) {
      nodes.add(curNodeToSplits.getKey());
      List<SegmentFileSplit> splitInCurNode = curNodeToSplits.getValue();
      for (SegmentFileSplit curSfs : splitInCurNode) {
        if (splitToNodes.containsKey(curSfs)) {
          validSegment.add(curSfs);
          splitToNodes.remove(curSfs);
          curSplitSize += curSfs.segmentLength;
          if (maxSize != 0 && curSplitSize >= maxSize) {
            addCreatedSplit(validSegment, nodes, outsplits);
            curSplitSize = 0;
            validSegment.clear();
          }
        }
      }
      if (minSizeNode != 0 && curSplitSize > minSizeNode) {
        addCreatedSplit(validSegment, nodes, outsplits);
      } else {
        // too small for a node-local split; give the segments back so they can
        // be combined at rack level
        for (SegmentFileSplit sfs : validSegment) {
          splitToNodes.put(sfs, sfs.hosts);
        }
      }
      validSegment.clear();
      curSplitSize = 0;
      nodes.clear();
    } // end of node-local combining

    // Combine the remaining splits per rack. Racks whose total size stays below
    // minSizeRack are put into 'overflow'; after all racks are processed, the
    // overflow segments are combined into splits.
    List<SegmentFileSplit> overflowSegments = new ArrayList<SegmentFileSplit>();
    List<String> racks = new ArrayList<String>();
    // while (splitToNodes.size() > 0) {
    for (Entry<String, List<SegmentFileSplit>> curRackToSplit : rackToSplits.entrySet()) {
      racks.add(curRackToSplit.getKey());
      List<SegmentFileSplit> splitsInCurRack = curRackToSplit.getValue();
      boolean createdSplit = false;
      for (SegmentFileSplit curSfs : splitsInCurRack) {
        if (splitToNodes.containsKey(curSfs)) {
          validSegment.add(curSfs);
          splitToNodes.remove(curSfs);
          curSplitSize += curSfs.segmentLength;
          if (maxSize != 0 && curSplitSize >= maxSize) {
            addCreatedSplit(validSegment, getHosts(racks), outsplits);
            createdSplit = true;
            break;
          }
        }
      }
      if (createdSplit) {
        curSplitSize = 0;
        validSegment.clear();
        racks.clear();
        continue;
      }
      if (!validSegment.isEmpty()) {
        if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
          addCreatedSplit(validSegment, getHosts(racks), outsplits);
        } else {
          overflowSegments.addAll(validSegment);
        }
      }
      curSplitSize = 0;
      validSegment.clear();
      racks.clear();
    }
    // }

    assert splitToNodes.isEmpty();
    assert curSplitSize == 0;
    assert racks.isEmpty();
    assert validSegment.isEmpty();

    // process all overflow splits
    for (SegmentFileSplit curSfs : overflowSegments) {
      validSegment.add(curSfs);
      curSplitSize += curSfs.segmentLength;
      String[] curRacks = genRack(curSfs.hosts, curSfs.blkLocation);
      for (int i = 0; i < curRacks.length; i++) {
        racks.add(curRacks[i]);
      }
      if (maxSize != 0 && curSplitSize >= maxSize) {
        addCreatedSplit(validSegment, getHosts(racks), outsplits);
        curSplitSize = 0;
        validSegment.clear();
        racks.clear();
      }
    }
    // process all remaining splits
    if (!validSegment.isEmpty()) {
      addCreatedSplit(validSegment, getHosts(racks), outsplits);
    }
  }

  private void initMapping(List<SegmentFileSplit> insplits,
      HashMap<String, List<SegmentFileSplit>> rackToSplits,
      HashMap<SegmentFileSplit, String[]> splitToNodes,
      HashMap<String, List<SegmentFileSplit>> nodeToSplits) throws IOException {
    for (SegmentFileSplit sfs : insplits) {
      // split to host map
      splitToNodes.put(sfs, sfs.hosts);
      BlockLocation curLoc = sfs.blkLocation;
      String[] hosts = sfs.hosts;
      String[] racks = genRack(hosts, curLoc);
      // rack to split map
      for (int ri = 0; ri < racks.length; ri++) {
        String rack = racks[ri];
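        // Record this split under each rack holding a replica, and remember the
        // host-to-rack pairing in the static rackToNodes map so getHosts() can
        // later expand a list of racks back into candidate hosts.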
        List<SegmentFileSplit> splitList = rackToSplits.get(rack);
        if (splitList == null) {
          splitList = new ArrayList<SegmentFileSplit>();
          rackToSplits.put(rack, splitList);
        }
        splitList.add(sfs);
        addHostToRack(racks[ri], hosts[ri]);
      }
      // host to split map
      for (int hi = 0; hi < hosts.length; hi++) {
        String node = hosts[hi];
        List<SegmentFileSplit> splitList = nodeToSplits.get(node);
        if (splitList == null) {
          splitList = new ArrayList<SegmentFileSplit>();
          nodeToSplits.put(node, splitList);
        }
        splitList.add(sfs);
      }
    }
  }

  private void addCreatedSplit(List<SegmentFileSplit> parts, List<String> hosts,
      List<SegmentFileCombineSplit> outsplits) {
    SegmentFileCombineSplit sfcs =
        new SegmentFileCombineSplit(parts, hosts.toArray(new String[0]));
    outsplits.add(sfcs);
  }

  private String[] genRack(String[] hosts, BlockLocation curLoc) throws IOException {
    String[] topoPath = curLoc.getTopologyPaths();
    if (topoPath.length == 0) {
      topoPath = new String[hosts.length];
      for (int i = 0; i < hosts.length; i++) {
        topoPath[i] = (new NodeBase(hosts[i], NetworkTopology.DEFAULT_RACK)).toString();
      }
    }
    String[] racks = new String[topoPath.length];
    for (int i = 0; i < racks.length; i++) {
      racks[i] = (new NodeBase(topoPath[i])).getNetworkLocation();
    }
    return racks;
  }

  @Override
  protected FileStatus[] listStatus(JobConf job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
      throw new IOException("No input paths specified in job");
    }
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
      filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
      FileSystem fs = p.getFileSystem(job);
      FileStatus[] matches = fs.globStatus(p, inputFilter);
      if (matches == null) {
        errors.add(new IOException("Input path does not exist: " + p));
      } else if (matches.length == 0) {
        errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
      } else {
        for (FileStatus globStat : matches) {
          if (globStat.isDir()) {
            for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
              result.add(stat);
            }
          } else {
            result.add(globStat);
          }
        }
      }
    }

    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result.toArray(new FileStatus[result.size()]);
  }

  /**
   * Get the list of input {@link Path}s for the map-reduce job.
   *
   * @param conf The configuration of the job
   * @return the list of input {@link Path}s for the map-reduce job.
   */
  public static Path[] getInputPaths(JobConf conf) {
    String dirs = conf.get("mapred.input.dir");
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
      result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
  }

  /**
   * Get a PathFilter instance of the filter set for the input paths.
   *
   * @return the PathFilter instance set for the job, null if none has been set.
   */
  public static PathFilter getInputPathFilter(JobConf conf) {
    Class<? extends PathFilter> filterClass =
        conf.getClass("mapred.input.pathFilter.class", null, PathFilter.class);
    return (filterClass != null) ? ReflectionUtils.newInstance(filterClass, conf) : null;
  }
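  // A minimal driver-side sketch of how the configuration keys read by
  // getSplits() and getInputPaths() above could be set when using this input
  // format directly through the old mapred API. The table path and sizes are
  // illustrative assumptions; in practice the Hive storage handler wires up
  // the job (including hive.io.filter.expr.serialized).
  //
  //   JobConf job = new JobConf();
  //   job.setInputFormat(SegmentFileInputFormat.class);
  //   job.set("mapred.input.dir", "/user/hive/warehouse/t");            // hypothetical path
  //   job.setLong("mastiff.mapred.max.split.size", 512L * 1024 * 1024); // combine up to ~512 MB
  //   job.setLong("mastiff.mapred.min.split.size.per.node", 64L * 1024 * 1024);
  //   job.setLong("mastiff.mapred.min.split.size.per.rack", 64L * 1024 * 1024);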

  private static final PathFilter hiddenFileFilter = new PathFilter() {
    @Override
    public boolean accept(Path p) {
      String name = p.getName();
      return !name.startsWith("_") && !name.startsWith(".");
    }
  };

  /**
   * Proxy PathFilter that accepts a path only if all filters given in the
   * constructor do. Used by listStatus() to apply the built-in
   * hiddenFileFilter together with a user-provided one (if any).
   */
  private static class MultiPathFilter implements PathFilter {
    private final List<PathFilter> filters;

    public MultiPathFilter(List<PathFilter> filters) {
      this.filters = filters;
    }

    @Override
    public boolean accept(Path path) {
      for (PathFilter filter : filters) {
        if (!filter.accept(path)) {
          return false;
        }
      }
      return true;
    }
  }

  private static void addHostToRack(String rack, String host) {
    Set<String> hosts = rackToNodes.get(rack);
    if (hosts == null) {
      hosts = new HashSet<String>();
      rackToNodes.put(rack, hosts);
    }
    hosts.add(host);
  }

  private static List<String> getHosts(List<String> racks) {
    List<String> hosts = new ArrayList<String>();
    for (String rack : racks) {
      hosts.addAll(rackToNodes.get(rack));
    }
    return hosts;
  }
}