/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// based on TeraSort from the Hadoop examples
package it.crs4.seal.tsv_sort;
import it.crs4.seal.common.CutText;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.IndexedSortable;
import org.apache.hadoop.util.QuickSort;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* An input format that reads the configured fields as the key and the whole
* line as the value. Both key and value are represented as Text.
*/
public class TsvInputFormat extends FileInputFormat<Text,Text> implements Configurable {
  private static final Log LOG = LogFactory.getLog(TsvInputFormat.class);

  /**
   * Configuration property holding the comma-separated list of 1-based key
   * columns and inclusive column ranges (e.g. "1,3-5,12"). An empty or
   * missing value selects the entire line as the key.
   */
  public static final String COLUMN_KEYS_CONF = "seal.tsv-input.key-columns";

  /** Configuration property holding the field delimiter handed to CutText. */
  public static final String DELIM_CONF = "seal.tsv-input.delim";

  /**
   * Delimiter used when DELIM_CONF is not set. The identifier is misspelled,
   * but it is public and therefore kept as is for compatibility.
   */
  public static final String DELIM_DEFALT = "\t";

  /**
   * Matches one element of the key column specification: either a range
   * "start-end" (groups 1 and 2) or a single column number (group 3).
   * Column numbers may have any number of digits.
   */
  protected static final Pattern RangeSelectorPatter = Pattern.compile("(\\d+)-(\\d+)|(\\d+)");

  // Single-entry cache used by getSplits: the last context seen and the
  // splits that were computed for it.
  // NOTE(review): these are static and accessed without synchronization;
  // confirm getSplits is only ever called from a single thread.
  protected static JobContext lastContext = null;
  protected static List<InputSplit> lastResult = null;

  /** Zero-based indices of the key columns; empty means "whole line is the key". */
  protected int[] keyFields = null;
  protected Configuration conf;
  /** The COLUMN_KEYS_CONF value that keyFields was last computed from. */
  protected String cachedKeyFieldSelector;

  /**
   * Scan the config parameter COLUMN_KEYS_CONF and set keyFields.
   *
   * The specification is a comma-separated list whose elements are either a
   * 1-based column number or an inclusive range "start-end". The resulting
   * keyFields array holds the corresponding 0-based indices, in the order
   * specified. The work is skipped when the property value is the same one
   * that was processed last time.
   *
   * @param conf configuration to read COLUMN_KEYS_CONF from
   * @throws IllegalArgumentException if an element is not syntactically
   *         valid, a range has start > end, a column number is less than 1,
   *         or a number does not fit in an int (NumberFormatException).
   *         In all these cases keyFields and the cached selector are left
   *         untouched.
   */
  private void setupKeyFields(Configuration conf)
  {
    String keyFieldSelector = conf.get(COLUMN_KEYS_CONF, "");
    if (keyFieldSelector.equals(cachedKeyFieldSelector))
      return; // no need to redo the work

    List<Integer> fields = new ArrayList<Integer>();

    if (keyFieldSelector.isEmpty())
    {
      LOG.info("key column(s) property not specified (" + COLUMN_KEYS_CONF + "). Using entire line as the key.");
    }
    else
    {
      for (String g : keyFieldSelector.split(","))
      {
        Matcher m = RangeSelectorPatter.matcher(g);
        if (!m.matches())
          throw new IllegalArgumentException("Invalid key column specification syntax " + keyFieldSelector);

        if (m.group(1) == null)
        {
          // the range alternative did not participate: simple column number
          fields.add(Integer.parseInt(m.group(3)));
        }
        else
        {
          int start = Integer.parseInt(m.group(1));
          int end = Integer.parseInt(m.group(2));
          if (start > end)
            throw new IllegalArgumentException("key field specification contains a range with start > end: " + keyFieldSelector);

          for (int i = start; i <= end; ++i)
            fields.add(i);
        }
      }
    }

    // Validate and convert to 0-based indices in a local array, so that a
    // validation failure cannot leave the instance with a half-built
    // keyFields that disagrees with cachedKeyFieldSelector.
    int[] parsed = new int[fields.size()];
    for (int i = 0; i < parsed.length; ++i)
    {
      int column = fields.get(i);
      if (column <= 0)
        throw new IllegalArgumentException("Field numbers must be greater than or equal to 1 (found " + column + ")");
      parsed[i] = column - 1;
    }

    keyFields = parsed;
    // cache the processed keyFieldSelector value
    cachedKeyFieldSelector = keyFieldSelector;
  }

  /**
   * Stores the configuration and (re)computes the key fields from it.
   *
   * @param conf the configuration to use
   * @throws IllegalArgumentException if the key column specification in
   *         conf is invalid
   */
  @Override
  public void setConf(Configuration conf)
  {
    this.conf = conf;
    setupKeyFields(conf);
  }

  /** @return the configuration last passed to setConf */
  @Override
  public Configuration getConf() { return conf; }

  /**
   * Creates a TsvRecordReader configured from the task's configuration.
   *
   * The configuration is re-read from the context, so the key fields always
   * reflect the configuration of the task being run.
   */
  @Override
  public RecordReader<Text, Text>
    createRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException
  {
    setConf(context.getConfiguration());
    return new TsvRecordReader(getConf(), keyFields);
  }

  /**
   * Implements caching getSplits.
   *
   * When called again with the very same JobContext object (identity
   * comparison) the previously computed list is returned instead of
   * computing the splits anew.
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (context == lastContext) {
      return lastResult;
    }
    lastContext = context;
    lastResult = super.getSplits(context);
    return lastResult;
  }

  /**
   * Record reader that wraps a LineRecordReader: the value is the line as
   * read, and the key is built from the fields selected by the key field
   * indices (or is the whole line if no key fields are given).
   */
  static class TsvRecordReader extends RecordReader<Text,Text>
  {
    private static final Log LOG = LogFactory.getLog(TsvRecordReader.class);

    private final LineRecordReader in;
    private final Text key = new Text();
    private Text line = new Text();
    // Both are null when the whole line is used as the key.
    private final CutText cutter;
    private final StringBuilder builder;

    /**
     * @param conf configuration from which the delimiter (DELIM_CONF) is read
     * @param keyFields zero-based indices of the key columns; an empty array
     *        selects the entire line as the key
     */
    public TsvRecordReader(Configuration conf, int[] keyFields) throws IOException
    {
      in = new LineRecordReader();
      if (keyFields.length == 0)
      {
        cutter = null;
        builder = null;
      }
      else
      {
        cutter = new CutText(conf.get(DELIM_CONF, DELIM_DEFALT), keyFields);
        builder = new StringBuilder(1000);
      }
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      in.initialize(split, context);
    }

    @Override
    public void close() throws IOException {
      in.close();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return in.getProgress();
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException { return key; }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException { return line; }

    /**
     * Advances to the next line and computes its key.
     *
     * @return true if a record was read, false at the end of the input
     * @throws RuntimeException if CutText reports a format problem with the
     *         line; the original FormatException is attached as the cause
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException
    {
      if (!in.nextKeyValue())
        return false;

      line = in.getCurrentValue();
      if (cutter == null)
      {
        // whole line is the key
        key.set(line);
        return true;
      }

      try {
        builder.setLength(0);
        cutter.loadRecord(line);
        int nFields = cutter.getNumFields();
        // NOTE(review): the fields are concatenated without a separator, so
        // different field tuples can produce the same key (e.g. "ab","c" and
        // "a","bc"). Confirm this is intended before relying on key equality.
        for (int i = 0; i < nFields; ++i)
          builder.append(cutter.getField(i));
        key.set(builder.toString());
      } catch (CutText.FormatException e) {
        // keep the cause so the underlying parsing problem is not lost
        throw new RuntimeException("format problem with line: " + line, e);
      }
      return true;
    }
  }
}