/**
 * Copyright 2013 Netflix, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.netflix.aegisthus.input;

import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.netflix.Aegisthus;
import com.netflix.aegisthus.input.readers.SSTableRecordReader;
import com.netflix.aegisthus.input.splits.AegCompressedSplit;
import com.netflix.aegisthus.input.splits.AegSplit;
import com.netflix.aegisthus.io.sstable.IndexDatabaseScanner;
import com.netflix.aegisthus.io.writable.AegisthusKey;
import com.netflix.aegisthus.io.writable.AtomWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * The AegisthusInputFormat class handles creating splits and record readers.
 */
public class AegisthusInputFormat extends FileInputFormat<AegisthusKey, AtomWritable> {
    private static final Logger LOG = LoggerFactory.getLogger(AegisthusInputFormat.class);

    @Override
    public RecordReader<AegisthusKey, AtomWritable> createRecordReader(InputSplit inputSplit,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new SSTableRecordReader();
    }

    /**
     * The main thing that getSSTableSplitsForFile handles is splitting SSTables
     * using their index if available. The general algorithm is that if the file
     * is larger than the block size, plus some fuzzy factor, it is split at row
     * boundaries read from the corresponding Index.db file; otherwise the whole
     * file becomes a single split.
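     * <p>
     * As a hypothetical illustration (the 256 MB figure is not from the original
     * source; only the 0.99 and 1.2 factors below are), with a 256 MB block size:
     * <pre>
     *   maxSplitSize = 256 MB * 0.99 = ~253 MB  // target size of each split
     *   fuzzySplit   = 256 MB * 1.2  = ~307 MB  // files below this stay whole
     * </pre>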
     */
    List<InputSplit> getSSTableSplitsForFile(JobContext job, FileStatus file) throws IOException {
        long length = file.getLen();
        if (length == 0) {
            LOG.info("skipping zero length file: {}", file.getPath());
            return Collections.emptyList();
        }

        Path path = file.getPath();
        Configuration conf = job.getConfiguration();
        FileSystem fs = path.getFileSystem(conf);
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        // A companion CompressionInfo.db file means the data file is compressed;
        // the whole file is handled as a single compressed split.
        Path compressionPath = new Path(path.getParent(),
                path.getName().replaceAll("-Data.db", "-CompressionInfo.db"));
        if (fs.exists(compressionPath)) {
            return ImmutableList.of((InputSplit) AegCompressedSplit.createAegCompressedSplit(path, 0, length,
                    blkLocations[blkLocations.length - 1].getHosts(), compressionPath, conf));
        }

        long blockSize = file.getBlockSize();
        String aegisthusBlockSize = conf.get(Aegisthus.Feature.CONF_BLOCKSIZE);
        if (!Strings.isNullOrEmpty(aegisthusBlockSize)) {
            blockSize = Long.valueOf(aegisthusBlockSize);
        }
        long maxSplitSize = (long) (blockSize * .99);
        long fuzzySplit = (long) (blockSize * 1.2);
        long bytesRemaining = length;

        List<InputSplit> splits = Lists.newArrayList();
        IndexDatabaseScanner scanner = null;
        // Only initialize if we are going to have more than a single split
        if (fuzzySplit < length) {
            Path indexPath = new Path(path.getParent(), path.getName().replaceAll("-Data.db", "-Index.db"));
            if (!fs.exists(indexPath)) {
                fuzzySplit = length;
            } else {
                FSDataInputStream fileIn = fs.open(indexPath);
                scanner = new IndexDatabaseScanner(new BufferedInputStream(fileIn));
            }
        }

        long splitStart = 0;
        while (splitStart + fuzzySplit < length && scanner != null && scanner.hasNext()) {
            long splitSize = 0;
            // The scanner returns an offset from the start of the file.
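            // Advance through index entries until this split reaches maxSplitSize;
            // the last entry consumed sets the split end, so every split ends on a
            // row boundary and may overshoot maxSplitSize by up to one row.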
            while (splitSize < maxSplitSize && scanner.hasNext()) {
                IndexDatabaseScanner.OffsetInfo offsetInfo = scanner.next();
                splitSize = offsetInfo.getDataFileOffset() - splitStart;
            }
            int blkIndex = getBlockIndex(blkLocations, splitStart + (splitSize / 2));
            LOG.debug("split path: {}:{}:{}", path.getName(), splitStart, splitSize);
            splits.add(AegSplit.createSplit(path, splitStart, splitSize, blkLocations[blkIndex].getHosts()));
            bytesRemaining -= splitSize;
            splitStart += splitSize;
        }
        if (scanner != null) {
            scanner.close();
        }

        if (bytesRemaining != 0) {
            LOG.debug("end path: {}:{}:{}", path.getName(), length - bytesRemaining, bytesRemaining);
            splits.add(AegSplit.createSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }

        return splits;
    }

    @Override
    public List<InputSplit> getSplits(final JobContext job) throws IOException {
        // TODO switch to Java 8 and replace this with streams
        List<FileStatus> allFiles = listStatus(job);
        List<FileStatus> dataFiles = Lists.newLinkedList();
        final Queue<InputSplit> allSplits = new ConcurrentLinkedQueue<>();

        for (FileStatus file : allFiles) {
            String name = file.getPath().getName();
            if (name.endsWith("-Data.db")) {
                dataFiles.add(file);
            }
        }
        LOG.info("Calculating splits for {} input files of which {} are data files", allFiles.size(),
                dataFiles.size());

        // TODO make the number of threads configurable
        ListeningExecutorService service = MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(20));
        for (final FileStatus file : dataFiles) {
            ListenableFuture<List<InputSplit>> future = service.submit(new Callable<List<InputSplit>>() {
                public List<InputSplit> call() throws IOException {
                    List<InputSplit> splitsForFile = getSSTableSplitsForFile(job, file);
                    LOG.info("Split '{}' into {} splits", file.getPath(), splitsForFile.size());
                    return splitsForFile;
                }
            });
            Futures.addCallback(future, new FutureCallback<List<InputSplit>>() {
                public void onFailure(Throwable thrown) {
                    throw Throwables.propagate(thrown);
                }

                public void onSuccess(List<InputSplit> splits) {
                    allSplits.addAll(splits);
                }
            });
        }

        try {
            service.shutdown();
            // TODO timeout configurable
            service.awaitTermination(2, TimeUnit.HOURS);
        } catch (InterruptedException e) {
            throw Throwables.propagate(e);
        }

        ImmutableList<InputSplit> inputSplits = ImmutableList.copyOf(allSplits);
        LOG.info("Split {} input data files into {} parts", dataFiles.size(), inputSplits.size());
        return inputSplits;
    }
}
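// A minimal usage sketch (illustrative; not part of the original file): wiring
// AegisthusInputFormat into a MapReduce job. The job name and input path are
// hypothetical placeholders.
//
//     Configuration conf = new Configuration();
//     Job job = Job.getInstance(conf, "aegisthus-example");
//     job.setInputFormatClass(AegisthusInputFormat.class);
//     FileInputFormat.addInputPath(job, new Path("hdfs:///path/to/sstables"));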