/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.runtime.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Singular;
import lombok.extern.slf4j.Slf4j;


/**
 * An input format for reading Gobblin inputs (work unit and multi work unit files).
 *
 * <p>Each input path is expected to be a directory containing serialized work unit /
 * multi work unit files. The files found are distributed evenly across at most
 * {@link #setMaxMappers(Job, int)} splits, and each record emitted by the
 * {@link RecordReader} is the path of one such file (key = index within the split).
 */
@Slf4j
public class GobblinWorkUnitsInputFormat extends InputFormat<LongWritable, Text> {

  private static final String MAX_MAPPERS = GobblinWorkUnitsInputFormat.class.getName() + ".maxMappers";

  /**
   * Set max mappers used in MR job.
   *
   * @param job the MR job to configure
   * @param maxMappers upper bound on the number of splits (and therefore mappers); must be positive
   */
  public static void setMaxMappers(Job job, int maxMappers) {
    job.getConfiguration().setInt(MAX_MAPPERS, maxMappers);
  }

  /**
   * Get the configured max mappers, defaulting to {@link Integer#MAX_VALUE}
   * (i.e. one mapper per input file) when unset.
   */
  public static int getMaxMapper(Configuration conf) {
    return conf.getInt(MAX_MAPPERS, Integer.MAX_VALUE);
  }

  /**
   * Lists all work unit / multi work unit files under the configured input paths and
   * packs them into at most {@code maxMappers} splits of (nearly) equal size.
   *
   * @throws IOException if no input paths are set, an input path does not exist,
   *         or the configured max mappers is not positive
   */
  @Override
  public List<InputSplit> getSplits(JobContext context)
      throws IOException, InterruptedException {

    Path[] inputPaths = FileInputFormat.getInputPaths(context);
    if (inputPaths == null || inputPaths.length == 0) {
      throw new IOException("No input found!");
    }

    List<String> allPaths = Lists.newArrayList();

    for (Path path : inputPaths) {
      // path is a single work unit / multi work unit
      FileSystem fs = path.getFileSystem(context.getConfiguration());
      FileStatus[] inputs = fs.listStatus(path);

      if (inputs == null) {
        throw new IOException(String.format("Path %s does not exist.", path));
      }
      // Parameterized logging: formatting is skipped entirely when INFO is disabled.
      log.info("Found {} input files at {}: {}", inputs.length, path, Arrays.toString(inputs));
      for (FileStatus input : inputs) {
        allPaths.add(input.getPath().toString());
      }
    }

    int maxMappers = getMaxMapper(context.getConfiguration());
    if (maxMappers <= 0) {
      // Guard against a division by zero / nonsensical negative setting below.
      throw new IOException(String.format("Max mappers must be positive, found %d.", maxMappers));
    }
    // Ceiling division: spread files so that no more than maxMappers splits are created.
    int numTasksPerMapper =
        allPaths.size() % maxMappers == 0 ? allPaths.size() / maxMappers : allPaths.size() / maxMappers + 1;

    List<InputSplit> splits = Lists.newArrayList();
    Iterator<String> pathsIt = allPaths.iterator();
    while (pathsIt.hasNext()) {
      // Iterators.limit advances the shared underlying iterator, so successive
      // splits consume consecutive, non-overlapping chunks of allPaths.
      Iterator<String> limitedIterator = Iterators.limit(pathsIt, numTasksPerMapper);
      splits.add(new GobblinSplit(Lists.newArrayList(limitedIterator)));
    }

    return splits;
  }

  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    return new GobblinRecordReader((GobblinSplit) split);
  }

  /**
   * {@link InputSplit} that just contain the work unit / multi work unit files
   * that each mapper should process.
   */
  @AllArgsConstructor
  @NoArgsConstructor
  @Builder
  @EqualsAndHashCode
  public static class GobblinSplit extends InputSplit implements Writable {

    /**
     * A list of {@link Path}s containing work unit / multi work unit.
     */
    @Getter
    @Singular
    private List<String> paths;

    /** Serializes as a count followed by each path in UTF; mirror of {@link #readFields}. */
    @Override
    public void write(DataOutput out)
        throws IOException {
      out.writeInt(this.paths.size());
      for (String path : this.paths) {
        out.writeUTF(path);
      }
    }

    @Override
    public void readFields(DataInput in)
        throws IOException {
      int numPaths = in.readInt();
      this.paths = Lists.newArrayList();
      for (int i = 0; i < numPaths; i++) {
        this.paths.add(in.readUTF());
      }
    }

    /** Splits carry no byte length; locality is not used for scheduling. */
    @Override
    public long getLength()
        throws IOException, InterruptedException {
      return 0;
    }

    @Override
    public String[] getLocations()
        throws IOException, InterruptedException {
      return new String[0];
    }
  }

  /**
   * Returns records containing the name of the work unit / multi work unit files to process.
   * Key is the zero-based index of the file within the split; value is the file path.
   */
  public static class GobblinRecordReader extends RecordReader<LongWritable, Text> {

    private int currentIdx = -1;
    private final List<String> paths;
    private final int totalPaths;

    public GobblinRecordReader(GobblinSplit split) {
      this.paths = split.getPaths();
      this.totalPaths = this.paths.size();
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    }

    @Override
    public boolean nextKeyValue()
        throws IOException, InterruptedException {
      this.currentIdx++;
      return this.currentIdx < this.totalPaths;
    }

    @Override
    public LongWritable getCurrentKey()
        throws IOException, InterruptedException {
      return new LongWritable(this.currentIdx);
    }

    @Override
    public Text getCurrentValue()
        throws IOException, InterruptedException {
      return new Text(this.paths.get(this.currentIdx));
    }

    /**
     * Fraction of records consumed, clamped to [0, 1]. An empty split reports 1.0
     * (nothing left to do); the unguarded division previously produced NaN here.
     */
    @Override
    public float getProgress()
        throws IOException, InterruptedException {
      if (this.totalPaths == 0) {
        return 1.0f;
      }
      return Math.min(1.0f, (float) this.currentIdx / (float) this.totalPaths);
    }

    @Override
    public void close()
        throws IOException {
    }
  }
}