/**
* Copyright 2011-2017 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.runtime.stage.input;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
import com.asakusafw.runtime.directio.Counter;
import com.asakusafw.runtime.directio.DataDefinition;
import com.asakusafw.runtime.directio.DataFilter;
import com.asakusafw.runtime.directio.DataFormat;
import com.asakusafw.runtime.directio.DirectDataSource;
import com.asakusafw.runtime.directio.DirectDataSourceConstants;
import com.asakusafw.runtime.directio.DirectDataSourceRepository;
import com.asakusafw.runtime.directio.DirectInputFragment;
import com.asakusafw.runtime.directio.FilePattern;
import com.asakusafw.runtime.directio.SimpleDataDefinition;
import com.asakusafw.runtime.directio.hadoop.HadoopDataSourceUtil;
import com.asakusafw.runtime.io.ModelInput;
import com.asakusafw.runtime.stage.StageConstants;
import com.asakusafw.runtime.stage.StageInput;
import com.asakusafw.runtime.util.VariableTable;
/**
* A bridge implementation for Hadoop {@link InputFormat}.
* @since 0.2.5
* @version 0.7.3
*/
public final class BridgeInputFormat extends InputFormat<NullWritable, Object> {
static final Log LOG = LogFactory.getLog(BridgeInputFormat.class);
/**
 * Always fails: this entry point is not supported by this input format.
 * The overload that additionally takes a list of {@link StageInput} is the
 * one that actually computes splits.
 * @param context ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 * @deprecated this overload only throws {@link UnsupportedOperationException}
 */
@Override
@Deprecated
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    String message = "Direct access to getSplits() is not supported.";
    throw new UnsupportedOperationException(message);
}
/**
 * Computes the input splits for the given Direct I/O inputs.
 * <p>
 * The inputs are grouped by container path, data type, format and filter;
 * every fragment found for every path of every group becomes one
 * {@link BridgeInputSplit}. When no fragment is found at all, a single
 * {@link NullInputSplit} is returned instead of an empty list.
 * </p>
 * @param context current job context
 * @param inputList target input list
 * @return the computed splits (never empty)
 * @throws IOException if failed to compute splits
 * @throws InterruptedException if interrupted while computing inputs
 * @throws IllegalArgumentException if some parameters were {@code null}
 */
public List<InputSplit> getSplits(
        JobContext context,
        List<StageInput> inputList) throws IOException, InterruptedException {
    if (context == null) {
        throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
    }
    if (inputList == null) {
        throw new IllegalArgumentException("inputList must not be null"); //$NON-NLS-1$
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
                "Start computing splits for Direct I/O: input={0}", //$NON-NLS-1$
                inputList.size()));
    }
    // The start time is only sampled when the summary below can be reported.
    long startedAt = LOG.isInfoEnabled() ? System.currentTimeMillis() : -1L;

    DirectDataSourceRepository repository = getDataSourceRepository(context);
    Map<DirectInputGroup, List<InputPath>> grouped = extractInputList(context, repository, inputList);

    List<InputSplit> splits = new ArrayList<>();
    long bytes = 0;
    for (Map.Entry<DirectInputGroup, List<InputPath>> entry : grouped.entrySet()) {
        DirectInputGroup inputGroup = entry.getKey();
        DirectDataSource source = repository.getRelatedDataSource(inputGroup.containerPath);
        DataDefinition<?> dataDefinition = createDataDefinition(context.getConfiguration(), inputGroup);
        for (InputPath inputPath : entry.getValue()) {
            for (DirectInputFragment fragment
                    : getFragments(repository, inputGroup, inputPath, dataDefinition, source)) {
                bytes += fragment.getSize();
                splits.add(new BridgeInputSplit(inputGroup, fragment));
            }
        }
    }

    if (splits.isEmpty()) {
        // Execute this job even if there are no input fragments.
        // It will create empty output files required by successive jobs.
        splits.add(new NullInputSplit());
    }

    if (LOG.isInfoEnabled()) {
        // Report the data type of the first group as a representative of this input set.
        String primaryType = grouped.isEmpty()
                ? "(unknown)" //$NON-NLS-1$
                : grouped.keySet().iterator().next().dataType.getName();
        long finishedAt = System.currentTimeMillis();
        LOG.info(MessageFormat.format(
                "found Direct I/O input splits: primary-type={0}, fragments={1}, size={2}bytes, elapsed={3}ms",
                primaryType,
                splits.size(),
                bytes,
                finishedAt - startedAt));
    }
    return splits;
}
/**
 * Builds the data definition (data type, format instance and optional filter
 * instance) described by the given input group.
 * @param configuration the configuration used to instantiate the format and filter
 * @param group the input group that names the data type, format class and filter class
 * @return the created data definition
 */
private DataDefinition<?> createDataDefinition(Configuration configuration, DirectInputGroup group) {
    DataFormat<?> dataFormat = ReflectionUtils.newInstance(group.formatClass, configuration);
    DataFilter<?> dataFilter = createFilter(group.filterClass, configuration);
    return SimpleDataDefinition.newInstance(group.dataType, dataFormat, dataFilter);
}
/**
 * Looks up the input fragments of a single input path.
 * <p>
 * When nothing is found, an optional path is only reported to the log and
 * the empty list is returned, whereas a mandatory path raises an error.
 * </p>
 * @param <T> the data type
 * @param repo the repository used to resolve the data source ID for messages
 * @param group the group which the path belongs to
 * @param path the target path and pattern
 * @param definition the data definition of the input
 * @param dataSource the data source to search
 * @return the found fragments (may be empty only for optional inputs)
 * @throws IOException if a mandatory input has no fragments, or the search failed
 * @throws InterruptedException if interrupted while searching
 */
private <T> List<DirectInputFragment> getFragments(
        DirectDataSourceRepository repo,
        DirectInputGroup group,
        InputPath path,
        DataDefinition<T> definition,
        DirectDataSource dataSource) throws IOException, InterruptedException {
    assert group != null;
    assert path != null;
    assert definition != null;
    assert dataSource != null;
    List<DirectInputFragment> found =
            dataSource.findInputFragments(definition, path.componentPath, path.pattern);
    if (!found.isEmpty()) {
        return found;
    }
    // Nothing matched: gather the details shared by both the log and the error message.
    String sourceId = repo.getRelatedId(group.containerPath);
    String location = dataSource.path(path.componentPath, path.pattern);
    String typeName = definition.getDataFormat().getSupportedType().getName();
    if (!path.optional) {
        throw new IOException(MessageFormat.format(
                "Input not found (datasource={0}, path=\"{1}\", type={2})",
                sourceId,
                location,
                typeName));
    }
    LOG.info(MessageFormat.format(
            "Skipped optional input (datasource={0}, path=\"{1}\", type={2})",
            sourceId,
            location,
            typeName));
    return found;
}
/**
 * Groups the stage inputs by their container path, data class, format class
 * and filter class.
 * <p>
 * The returned map keeps the groups in the order in which they first appear
 * in {@code inputList}, so that the computed splits (and the representative
 * type reported to the log) do not depend on hash iteration order.
 * </p>
 * @param context the current job context
 * @param repo the data source repository used to resolve component paths
 * @param inputList the stage inputs to analyze
 * @return the input paths of each group, in declaration order
 * @throws IOException if some input has missing or invalid attributes
 */
private Map<DirectInputGroup, List<InputPath>> extractInputList(
        JobContext context,
        DirectDataSourceRepository repo,
        List<StageInput> inputList) throws IOException {
    assert context != null;
    assert repo != null;
    assert inputList != null;
    VariableTable variables = createBatchArgumentsTable(context.getConfiguration());
    // insertion-ordered: keeps split order deterministic across runs
    Map<DirectInputGroup, List<InputPath>> results = new LinkedHashMap<>();
    for (StageInput input : inputList) {
        // the base path goes through the batch argument table before it is resolved
        String fullBasePath = variables.parse(extractBasePath(input));
        String basePath = repo.getComponentPath(fullBasePath);
        FilePattern pattern = extractSearchPattern(context, variables, input);
        Class<?> dataClass = extractDataClass(context, input);
        Class<? extends DataFormat<?>> formatClass = extractFormatClass(context, input);
        Class<? extends DataFilter<?>> filterClass = extractFilterClass(context, input);
        DirectInputGroup group = new DirectInputGroup(fullBasePath, dataClass, formatClass, filterClass);
        List<InputPath> paths = results.get(group);
        if (paths == null) {
            paths = new ArrayList<>();
            results.put(group, paths);
        }
        paths.add(new InputPath(basePath, pattern, extractOptional(input)));
    }
    return results;
}
/**
 * Returns the mandatory base path attribute of the input.
 * @param input the target input
 * @return the raw attribute value
 * @throws IOException if the attribute is not defined
 */
private String extractBasePath(StageInput input) throws IOException {
    assert input != null;
    String basePath = extract(input, DirectDataSourceConstants.KEY_BASE_PATH);
    return basePath;
}
/**
 * Reads the mandatory resource path attribute, passes it through the
 * variable table, and compiles the result into a {@link FilePattern}.
 * @param context the current job context
 * @param variables the variable table applied to the raw attribute value
 * @param input the target input
 * @return the compiled pattern, which contains no variables
 * @throws IOException if the attribute is missing, the pattern is malformed,
 *     or the compiled pattern still contains variables
 */
private FilePattern extractSearchPattern(
        JobContext context,
        VariableTable variables,
        StageInput input) throws IOException {
    assert context != null;
    assert input != null;
    String resolved = variables.parse(extract(input, DirectDataSourceConstants.KEY_RESOURCE_PATH));
    try {
        FilePattern pattern = FilePattern.compile(resolved);
        if (!pattern.containsVariables()) {
            return pattern;
        }
        // Deliberately raised inside the try block: it is wrapped by the handler below.
        throw new IllegalArgumentException(MessageFormat.format(
                "Search pattern contains variables: {0}",
                resolved));
    } catch (IllegalArgumentException e) {
        throw new IOException(MessageFormat.format(
                "Invalid resource path pattern: \"{1}\" (path={0})",
                extractBasePath(input),
                resolved), e);
    }
}
/**
 * Returns whether the input is marked as optional.
 * Falls back to {@link DirectDataSourceConstants#DEFAULT_OPTIONAL} when the
 * attribute is not defined.
 * @param input the target input
 * @return {@code true} only if the effective attribute value is exactly {@code "true"}
 */
private boolean extractOptional(StageInput input) {
    assert input != null;
    String declared = input.getAttributes().get(DirectDataSourceConstants.KEY_OPTIONAL);
    String effective = (declared != null) ? declared : DirectDataSourceConstants.DEFAULT_OPTIONAL;
    return effective.equals("true"); //$NON-NLS-1$
}
/**
 * Loads (without initializing) the data model class named by the mandatory
 * data class attribute, using the class loader of the job configuration.
 * @param context the current job context
 * @param input the target input
 * @return the data class
 * @throws IOException if the attribute is missing or the class is not found
 */
private Class<?> extractDataClass(JobContext context, StageInput input) throws IOException {
    assert context != null;
    assert input != null;
    String className = extract(input, DirectDataSourceConstants.KEY_DATA_CLASS);
    ClassLoader loader = context.getConfiguration().getClassLoader();
    try {
        return Class.forName(className, false, loader);
    } catch (ClassNotFoundException e) {
        throw new IOException(MessageFormat.format(
                "Invalid data class: \"{1}\" (path={0})",
                extractBasePath(input),
                className), e);
    }
}
/**
 * Loads (without initializing) the {@link DataFormat} class named by the
 * mandatory format class attribute.
 * @param context the current job context
 * @param input the target input
 * @return the format class
 * @throws IOException if the attribute is missing, or the class cannot be
 *     loaded as a {@link DataFormat}
 */
@SuppressWarnings("unchecked")
private Class<? extends DataFormat<?>> extractFormatClass(
        JobContext context,
        StageInput input) throws IOException {
    assert context != null;
    assert input != null;
    String className = extract(input, DirectDataSourceConstants.KEY_FORMAT_CLASS);
    try {
        ClassLoader loader = context.getConfiguration().getClassLoader();
        Class<?> loaded = Class.forName(className, false, loader);
        return (Class<? extends DataFormat<?>>) loaded.asSubclass(DataFormat.class);
    } catch (Exception e) {
        throw new IOException(MessageFormat.format(
                "Invalid format class: \"{1}\" (path={0})",
                extractBasePath(input),
                className), e);
    }
}
/**
 * Loads (without initializing) the {@link DataFilter} class named by the
 * optional filter class attribute.
 * @param context the current job context
 * @param input the target input
 * @return the filter class, or {@code null} if the input declares no filter
 * @throws IOException if the named class cannot be loaded as a {@link DataFilter}
 */
@SuppressWarnings("unchecked")
private Class<? extends DataFilter<?>> extractFilterClass(
        JobContext context,
        StageInput input) throws IOException {
    assert context != null;
    assert input != null;
    // The filter is optional, so the attribute is read directly instead of via extract().
    String value = input.getAttributes().get(DirectDataSourceConstants.KEY_FILTER_CLASS);
    if (value == null) {
        return null;
    }
    try {
        Class<?> aClass = Class.forName(value, false, context.getConfiguration().getClassLoader());
        return (Class<? extends DataFilter<?>>) aClass.asSubclass(DataFilter.class);
    } catch (Exception e) {
        // Name the filter explicitly, so that it is not confused with a format class problem.
        throw new IOException(MessageFormat.format(
                "Invalid filter class: \"{1}\" (path={0})",
                extractBasePath(input),
                value), e);
    }
}
/**
 * Returns a mandatory attribute of the input.
 * @param input the target input
 * @param key the attribute key
 * @return the attribute value (never {@code null})
 * @throws IOException if the attribute is not defined
 */
private String extract(StageInput input, String key) throws IOException {
    String found = input.getAttributes().get(key);
    if (found != null) {
        return found;
    }
    throw new IOException(MessageFormat.format(
            "A mandatory attribute \"{1}\" is not defined (path={0})",
            input.getPathString(),
            key));
}
/**
 * Creates a record reader for a split produced by this input format.
 * A {@link BridgeInputSplit} is read through its data source, while a
 * {@link NullInputSplit} yields a reader without any records.
 * @param split the target split
 * @param context the current task attempt context
 * @return the created reader
 * @throws IOException if the split type is unknown, or the input cannot be opened
 * @throws InterruptedException if interrupted while opening the input
 */
@Override
public RecordReader<NullWritable, Object> createRecordReader(
        InputSplit split,
        TaskAttemptContext context) throws IOException, InterruptedException {
    if (split instanceof BridgeInputSplit) {
        BridgeInputSplit bridgeSplit = (BridgeInputSplit) split;
        DataDefinition<?> dataDefinition = createDataDefinition(context.getConfiguration(), bridgeSplit.group);
        return createRecordReader(dataDefinition, bridgeSplit, context);
    }
    if (split instanceof NullInputSplit) {
        return createNullRecordReader(context);
    }
    throw new IOException(MessageFormat.format(
            "Unknown input split: {0}",
            split));
}
/**
 * Instantiates the filter class and initializes it with the batch arguments
 * taken from the configuration.
 * @param filterClass the filter class, or {@code null} if no filter is used
 * @param configuration the current configuration
 * @return the initialized filter, or {@code null} if {@code filterClass} is {@code null}
 */
private DataFilter<?> createFilter(Class<? extends DataFilter<?>> filterClass, Configuration configuration) {
    if (filterClass == null) {
        return null;
    }
    DataFilter<?> filter = ReflectionUtils.newInstance(filterClass, configuration);
    VariableTable table = createBatchArgumentsTable(configuration);
    Map<String, String> arguments = table.getVariables();
    filter.initialize(new DataFilter.Context(arguments));
    return filter;
}
/**
 * Creates a variable table from the serialized batch arguments stored in the
 * configuration under {@link StageConstants#PROP_ASAKUSA_BATCH_ARGS}.
 * A missing property is treated as an empty string.
 * @param configuration the current configuration
 * @return the created table
 */
private VariableTable createBatchArgumentsTable(Configuration configuration) {
    String serialized = configuration.get(StageConstants.PROP_ASAKUSA_BATCH_ARGS, ""); //$NON-NLS-1$
    VariableTable table = new VariableTable(VariableTable.RedefineStrategy.IGNORE);
    table.defineVariables(serialized);
    return table;
}
/**
 * Opens the fragment of the split and wraps it into a record reader.
 * @param <T> the data type
 * @param definition the data definition of the input
 * @param split the split to read
 * @param context the current task attempt context
 * @return the created reader
 * @throws IOException if failed to open the input
 * @throws InterruptedException if interrupted while opening the input
 */
private <T> RecordReader<NullWritable, Object> createRecordReader(
        DataDefinition<T> definition,
        BridgeInputSplit split,
        TaskAttemptContext context) throws IOException, InterruptedException {
    assert definition != null;
    assert split != null;
    assert context != null;
    // One reusable object is handed to the input for every record.
    T record = ReflectionUtils.newInstance(definition.getDataClass(), context.getConfiguration());
    Counter sizeCounter = new Counter();
    DirectInputFragment target = split.fragment;
    ModelInput<T> modelInput = createInput(context, split.group.containerPath, definition, sizeCounter, target);
    return new BridgeRecordReader<>(modelInput, record, sizeCounter, target.getSize());
}
/**
 * Creates a record reader that provides no records.
 * @param context the current task attempt context (only checked by assertion)
 * @return the created reader
 */
private RecordReader<NullWritable, Object> createNullRecordReader(TaskAttemptContext context) {
    assert context != null;
    RecordReader<NullWritable, Object> empty = new NullRecordReader<NullWritable, Object>();
    return empty;
}
/**
 * Opens the fragment through the data source related to the container path.
 * @param <T> the data type
 * @param context the current task attempt context
 * @param containerPath the container path used to look up the data source
 * @param definition the data definition of the input
 * @param counter the counter passed to the data source
 * @param fragment the fragment to open
 * @return the opened input
 * @throws IOException if failed to open the input
 * @throws InterruptedException if interrupted while opening the input
 */
private <T> ModelInput<T> createInput(
        TaskAttemptContext context,
        String containerPath,
        DataDefinition<T> definition,
        Counter counter,
        DirectInputFragment fragment) throws IOException, InterruptedException {
    assert context != null;
    assert containerPath != null;
    assert definition != null;
    assert counter != null;
    assert fragment != null;
    DirectDataSource source = getDataSourceRepository(context).getRelatedDataSource(containerPath);
    return source.openInput(definition, fragment, counter);
}
/**
 * Loads the data source repository from the configuration of the context.
 * @param context the current job context
 * @return the loaded repository
 */
private static DirectDataSourceRepository getDataSourceRepository(JobContext context) {
    assert context != null;
    Configuration configuration = context.getConfiguration();
    return HadoopDataSourceUtil.loadRepository(configuration);
}
/**
 * A grouping key for inputs: two inputs belong to the same group when they
 * share the container path, data type, format class and filter class.
 */
private static class DirectInputGroup {

    final String containerPath;

    final Class<?> dataType;

    final Class<? extends DataFormat<?>> formatClass;

    // may be null: not every input declares a filter
    final Class<? extends DataFilter<?>> filterClass;

    DirectInputGroup(
            String containerPath,
            Class<?> dataType,
            Class<? extends DataFormat<?>> formatClass,
            Class<? extends DataFilter<?>> filterClass) {
        assert containerPath != null;
        assert dataType != null;
        assert formatClass != null;
        this.containerPath = containerPath;
        this.dataType = dataType;
        this.formatClass = formatClass;
        this.filterClass = filterClass;
    }

    @Override
    public int hashCode() {
        int hash = 31 + containerPath.hashCode();
        hash = 31 * hash + dataType.hashCode();
        hash = 31 * hash + formatClass.hashCode();
        hash = 31 * hash + (filterClass != null ? filterClass.hashCode() : 0);
        return hash;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        DirectInputGroup that = (DirectInputGroup) obj;
        return containerPath.equals(that.containerPath)
                && dataType.equals(that.dataType)
                && formatClass.equals(that.formatClass)
                && (filterClass == null
                        ? that.filterClass == null
                        : filterClass.equals(that.filterClass));
    }
}
/**
 * A single search target: a component path, the file pattern to look for,
 * and whether finding nothing is acceptable.
 */
private static class InputPath {

    final String componentPath;

    final FilePattern pattern;

    final boolean optional;

    InputPath(String componentPath, FilePattern pattern, boolean optional) {
        assert componentPath != null;
        assert pattern != null;
        this.optional = optional;
        this.pattern = pattern;
        this.componentPath = componentPath;
    }
}
/**
 * A bridge implementation for Hadoop {@link InputSplit}.
 * <p>
 * Each split pairs one {@link DirectInputFragment} with the input group it
 * belongs to, and transfers both through the {@link Writable} protocol.
 * Restoring a split requires that a configuration has been set beforehand,
 * because the group's classes are resolved through it.
 * </p>
 * @since 0.2.5
 */
public static class BridgeInputSplit extends InputSplit implements Writable, Configurable {

    volatile Configuration conf;

    volatile DirectInputGroup group;

    volatile DirectInputFragment fragment;

    /**
     * Creates a new instance for {@link Writable} facilities.
     */
    public BridgeInputSplit() {
        return;
    }

    BridgeInputSplit(DirectInputGroup group, DirectInputFragment fragment) {
        this.group = group;
        this.fragment = fragment;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public long getLength() throws IOException, InterruptedException {
        return fragment.getSize();
    }

    @Override
    public String[] getLocations() throws IOException, InterruptedException {
        List<String> locations = fragment.getOwnerNodeNames();
        return locations.toArray(new String[locations.size()]);
    }

    /**
     * Writes the group (container path and class names) followed by the
     * fragment (path, offset, size, owner nodes and attributes).
     * @param out the destination
     * @throws IOException if failed to write
     */
    @Override
    public void write(DataOutput out) throws IOException {
        // take snapshots of the volatile fields so that one consistent pair is written
        DirectInputGroup groupCopy = group;
        WritableUtils.writeString(out, groupCopy.containerPath);
        WritableUtils.writeString(out, groupCopy.dataType.getName());
        WritableUtils.writeString(out, groupCopy.formatClass.getName());
        // the filter is optional: a presence flag precedes its class name
        if (groupCopy.filterClass == null) {
            out.writeBoolean(false);
        } else {
            out.writeBoolean(true);
            WritableUtils.writeString(out, groupCopy.filterClass.getName());
        }
        DirectInputFragment fragmentCopy = fragment;
        WritableUtils.writeString(out, fragmentCopy.getPath());
        WritableUtils.writeVLong(out, fragmentCopy.getOffset());
        WritableUtils.writeVLong(out, fragmentCopy.getSize());
        List<String> ownerNodeNames = fragmentCopy.getOwnerNodeNames();
        WritableUtils.writeStringArray(out, ownerNodeNames.toArray(new String[ownerNodeNames.size()]));
        Map<String, String> attributes = fragmentCopy.getAttributes();
        WritableUtils.writeVInt(out, attributes.size());
        for (Map.Entry<String, String> entry : attributes.entrySet()) {
            WritableUtils.writeString(out, entry.getKey());
            WritableUtils.writeString(out, entry.getValue());
        }
    }

    /**
     * Restores the fields in the same order as {@link #write(DataOutput)}.
     * The group and the fragment are only replaced after every class name
     * has been resolved, so a failure leaves this object unchanged.
     * @param in the source
     * @throws IOException if failed to read, or a recorded class is missing
     *     or is not of the expected kind
     * @throws IllegalStateException if no configuration has been set
     */
    @SuppressWarnings("unchecked")
    @Override
    public void readFields(DataInput in) throws IOException {
        String containerPath = WritableUtils.readString(in);
        String dataTypeName = WritableUtils.readString(in);
        String formatTypeName = WritableUtils.readString(in);
        String filterTypeName = null;
        if (in.readBoolean()) {
            filterTypeName = WritableUtils.readString(in);
        }
        String path = WritableUtils.readString(in);
        long offset = WritableUtils.readVLong(in);
        long length = WritableUtils.readVLong(in);
        String[] locations = WritableUtils.readStringArray(in);
        Map<String, String> attributes;
        int attributeCount = WritableUtils.readVInt(in);
        if (attributeCount == 0) {
            attributes = Collections.emptyMap();
        } else {
            attributes = new HashMap<>();
            for (int i = 0; i < attributeCount; i++) {
                String key = WritableUtils.readString(in);
                String value = WritableUtils.readString(in);
                attributes.put(key, value);
            }
        }

        // classes are resolved through the configuration, so it must be available here
        Configuration configuration = conf;
        if (configuration == null) {
            throw new IllegalStateException("Configuration must be set before restoring a split");
        }
        DirectInputGroup restoredGroup;
        try {
            Class<? extends DataFormat<?>> formatClass = (Class<? extends DataFormat<?>>) configuration
                    .getClassByName(formatTypeName)
                    .asSubclass(DataFormat.class);
            Class<? extends DataFilter<?>> filterClass = null;
            if (filterTypeName != null) {
                filterClass = (Class<? extends DataFilter<?>>) configuration
                        .getClassByName(filterTypeName)
                        .asSubclass(DataFilter.class);
            }
            Class<?> dataType = configuration.getClassByName(dataTypeName);
            restoredGroup = new DirectInputGroup(containerPath, dataType, formatClass, filterClass);
        } catch (ClassNotFoundException | ClassCastException e) {
            // asSubclass() rejects a class of the wrong kind with ClassCastException;
            // report it the same way as a missing class
            throw new IOException("Failed to restore split", e);
        }
        this.fragment = new DirectInputFragment(path, offset, length, Arrays.asList(locations), attributes);
        this.group = restoredGroup;
    }
}
/**
 * A bridge implementation for Hadoop {@link RecordReader}.
 * <p>
 * Every record is read into one shared buffer object, which is also the
 * value returned by {@link #getCurrentValue()}; the key is always the
 * {@link NullWritable} singleton.
 * </p>
 * @param <T> input type
 * @since 0.2.5
 */
private static final class BridgeRecordReader<T> extends RecordReader<NullWritable, Object> {

    private static final NullWritable KEY = NullWritable.get();

    private final ModelInput<T> input;

    private final T buffer;

    // presumably counts the bytes consumed from the fragment so far - TODO confirm against Counter
    private final Counter sizeCounter;

    // kept as double so that the progress ratio is computed in floating point
    private final double fragmentSize;

    private boolean closed = false;

    BridgeRecordReader(
            ModelInput<T> input,
            T buffer,
            Counter sizeCounter,
            long fragmentSize) {
        assert input != null;
        assert buffer != null;
        assert sizeCounter != null;
        this.sizeCounter = sizeCounter;
        this.input = input;
        this.buffer = buffer;
        if (fragmentSize < 0) {
            // a negative size makes the progress ratio stay at zero until closed
            this.fragmentSize = Double.POSITIVE_INFINITY;
        } else {
            this.fragmentSize = fragmentSize;
        }
    }

    @Override
    public void initialize(
            InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        assert split instanceof BridgeInputSplit;
    }

    /**
     * Reads the next record into the shared buffer.
     * @return {@code true} if a record was read, or {@code false} if the
     *     input is exhausted or this reader has been closed
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (closed) {
            return false;
        }
        return input.readTo(buffer);
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return KEY;
    }

    @Override
    public Object getCurrentValue() throws IOException, InterruptedException {
        return buffer;
    }

    /**
     * Returns the ratio of the counter value to the fragment size, capped at
     * {@code 0.99} until this reader is closed.
     * @return the progress, which is never {@code NaN}
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (closed) {
            return 1.0f;
        }
        double ratio = sizeCounter.get() / fragmentSize;
        if (Double.isNaN(ratio)) {
            // 0 / 0: nothing has been read from an empty fragment yet
            return 0.0f;
        }
        return Math.min((float) ratio, 0.99f);
    }

    @Override
    public void close() throws IOException {
        if (closed) {
            return;
        }
        // mark first, so that a failing close() is not attempted twice
        closed = true;
        input.close();
    }
}
/**
 * Empty implementation for Hadoop {@link InputSplit}.
 * It has zero length, no locations, and nothing to serialize.
 * @since 0.6.1
 */
public static final class NullInputSplit extends InputSplit implements Writable, Configurable {

    volatile Configuration conf;

    /**
     * Creates a new instance for {@link Writable} facilities.
     */
    public NullInputSplit() {
        // nothing to initialize
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public long getLength() throws IOException, InterruptedException {
        return 0L;
    }

    @Override
    public String[] getLocations() throws IOException, InterruptedException {
        String[] none = {};
        return none;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // no fields to restore
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // no fields to store
    }
}
/**
 * Empty implementation for Hadoop {@link RecordReader}.
 * It never provides a record and always reports itself as completed.
 * @param <KEYIN> the key type
 * @param <VALUEIN> the value type
 */
public static final class NullRecordReader<KEYIN, VALUEIN> extends RecordReader<KEYIN, VALUEIN> {

    /**
     * Creates a new instance.
     */
    public NullRecordReader() {
        // nothing to initialize
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        // nothing to prepare
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return false;
    }

    @Override
    public KEYIN getCurrentKey() throws IOException, InterruptedException {
        // there is never a current record
        throw new NoSuchElementException();
    }

    @Override
    public VALUEIN getCurrentValue() throws IOException, InterruptedException {
        // there is never a current record
        throw new NoSuchElementException();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return 1.0f;
    }

    @Override
    public void close() throws IOException {
        // nothing to release
    }
}
}