/**
* Copyright 2014 IPONWEB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.iponweb.hadoop.streaming.io;
import net.iponweb.hadoop.streaming.tools.KeyValueSplitter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Progressable;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
/* OutputFormat with a special ability: everything before the first TAB becomes the
* path name of the file into which that record is placed. Great for creating several
* semantically different outputs from a single job.
* If outputting records from several reducers, add the reducer ID to the end of
* the path, otherwise the file could become corrupted.
*
* Suppose a reducer outputs the following lines (assuming the separator is <TAB>):
*
* typeA/0<TAB>rest-of-the-record
* typeA/1<TAB>rest-of-the-record
* ...
* typeB/0<TAB>rest-of-the-record
*
* After that you will have the following files in your job output directory:
*
* typeA/0
* typeA/1
* ...
* typeB/0
*
* Please note, that real outputformat should be indicated as -D iow.streaming.bykeyoutputformat=<format>
* Supported formats are:
* text
* sequence
* avrotext (job output is text which is converted to Avro; See AvroAsTextOutputFormat)
* avrojson (job output is json which is converted to Avro; See AvroAsJsonOutputFormat)
* parquettext (job output is text which is converted to Parquet; See ParquetAsTextOutputFormat)
* parquetjson (job output is json which is converted to Parquet; See ParquetAsJsonOutputFormat)
*
* For non-text formats, different schemas are supported. The schema name should prefix
* the output file name and be separated from it by a colon:
*
* schemaA:typeA/0<TAB>...
*/
/**
 * Output format that routes every record to a file named after the leading part of the
 * record's key, delegating the actual writing to another {@link OutputFormat}.
 *
 * <p>Configuration read from the {@link JobConf}:
 * <ul>
 *   <li>{@code iow.streaming.bykeyoutputformat} - name of the delegate format
 *       (default {@code text}); must be one of the names registered in this class.</li>
 *   <li>{@code iow.streaming.bykeyoutputformat.keys.sorted} - when {@code true}, only one
 *       delegate writer is kept open at a time and it is closed as soon as the file name
 *       changes (default {@code false}).</li>
 *   <li>{@code map.output.key.field.separator} - delimiter handed to the
 *       {@link KeyValueSplitter} (default TAB).</li>
 * </ul>
 */
public class ByKeyOutputFormat extends FileOutputFormat<Text, Text> {

    private static final Log LOG = LogFactory.getLog(net.iponweb.hadoop.streaming.io.ByKeyOutputFormat.class);

    /** Maps a value of {@code iow.streaming.bykeyoutputformat} to the delegate class name. */
    private static final Map<String, String> SUPPORTED_OUTPUT_FORMATS = new HashMap<String, String>();

    static {
        SUPPORTED_OUTPUT_FORMATS.put("text", "org.apache.hadoop.mapred.TextOutputFormat");
        SUPPORTED_OUTPUT_FORMATS.put("sequence", "org.apache.hadoop.mapred.SequenceFileOutputFormat");
        SUPPORTED_OUTPUT_FORMATS.put("avrojson", "net.iponweb.hadoop.streaming.avro.AvroAsJsonOutputFormat");
        SUPPORTED_OUTPUT_FORMATS.put("avrotext", "net.iponweb.hadoop.streaming.avro.AvroAsTextOutputFormat");
        SUPPORTED_OUTPUT_FORMATS.put("parquettext", "net.iponweb.hadoop.streaming.parquet.ParquetAsTextOutputFormat");
        // JSON job output must go through the JSON converter, not the text one.
        SUPPORTED_OUTPUT_FORMATS.put("parquetjson", "net.iponweb.hadoop.streaming.parquet.ParquetAsJsonOutputFormat");
    }

    private OutputFormat<Text, Text> internalOutputFormat;
    private KeyValueSplitter splitter;
    private boolean assumeFileNamesSorted;

    /**
     * Reads the job configuration, instantiates the delegate output format and creates the
     * splitter used to cut keys and values.
     *
     * @param job job configuration to read the settings from
     * @throws IOException if the configured format name is not registered or the delegate
     *                     class cannot be instantiated (the original failure is attached as
     *                     the cause)
     */
    private void initialize(JobConf job) throws IOException {
        String format = job.get("iow.streaming.bykeyoutputformat", "text");
        String className = SUPPORTED_OUTPUT_FORMATS.get(format);
        if (className == null) {
            throw new IOException("Unknown result type: '" + format + "'");
        }

        try {
            @SuppressWarnings("unchecked") // every registered class is an OutputFormat of Text pairs
            OutputFormat<Text, Text> delegate =
                    (OutputFormat<Text, Text>) Class.forName(className).newInstance();
            internalOutputFormat = delegate;
        } catch (Exception e) {
            // Keep the cause so the real reason (missing class, no default constructor, ...) is visible.
            throw new IOException("Can't instantiate class '" + className + "'", e);
        }

        assumeFileNamesSorted = job.getBoolean("iow.streaming.bykeyoutputformat.keys.sorted", false);
        String delimiter = job.get("map.output.key.field.separator", "\t");
        splitter = new KeyValueSplitter(delimiter);
        LOG.info(getClass().getSimpleName() + " initialized, output format is: " + format);
    }

    /**
     * Creates a writer that dispatches each record to a per-file delegate writer.
     *
     * <p>The {@code name} argument is not used: file names are derived from the record keys
     * via {@link #generateFileNameForTexteyTextalue(Text, Text)}.
     *
     * @param fs           file system passed through to the delegate format
     * @param job          job configuration; also used to initialize this format
     * @param name         ignored
     * @param progressable progress callback, invoked after every written record
     * @return the dispatching record writer
     * @throws IOException if this format cannot be initialized from {@code job}
     */
    @Override
    public RecordWriter<Text, Text> getRecordWriter(final FileSystem fs, final JobConf job, String name, final Progressable progressable) throws IOException {
        initialize(job);

        return new RecordWriter<Text, Text>() {

            /** Sorted mode only: the single writer that is currently open. */
            private RecordWriter<Text, Text> currentWriter;
            /** Sorted mode only: file name {@link #currentWriter} was opened for. */
            private String currentFileName;
            /** Unsorted mode only: every writer opened so far, by file name. */
            private final Map<String, RecordWriter<Text, Text>> writersByFileName =
                    new TreeMap<String, RecordWriter<Text, Text>>();

            @Override
            public void write(Text key, Text value) throws IOException {
                String fileName = generateFileNameForTexteyTextalue(key, value);
                RecordWriter<Text, Text> target = assumeFileNamesSorted
                        ? writerForSortedInput(fileName)
                        : writerForUnsortedInput(fileName);
                target.write(key, value);
                progressable.progress();
            }

            /** Returns the open writer if the file name is unchanged, otherwise rolls over to a new one. */
            private RecordWriter<Text, Text> writerForSortedInput(String fileName) throws IOException {
                if (currentWriter == null || !fileName.equals(currentFileName)) {
                    if (currentWriter != null) {
                        // Forget the writer before closing it, so that a failure here or while
                        // opening the next file never leaves a closed writer behind to be reused.
                        RecordWriter<Text, Text> finished = currentWriter;
                        currentWriter = null;
                        currentFileName = null;
                        finished.close(Reporter.NULL);
                    }
                    currentWriter = getBaseRecordWriter(fs, job, fileName, progressable);
                    currentFileName = fileName;
                }
                return currentWriter;
            }

            /** Returns the cached writer for the file name, opening it on first use. */
            private RecordWriter<Text, Text> writerForUnsortedInput(String fileName) throws IOException {
                RecordWriter<Text, Text> writer = writersByFileName.get(fileName);
                if (writer == null) {
                    writer = getBaseRecordWriter(fs, job, fileName, progressable);
                    writersByFileName.put(fileName, writer);
                }
                return writer;
            }

            /**
             * Closes every writer that is still open. All writers are attempted even if some
             * fail; the first failure is rethrown afterwards and later ones are logged.
             */
            @Override
            public void close(Reporter reporter) throws IOException {
                IOException firstFailure = null;

                if (currentWriter != null) {
                    RecordWriter<Text, Text> last = currentWriter;
                    currentWriter = null;
                    currentFileName = null;
                    try {
                        last.close(reporter);
                    } catch (IOException e) {
                        firstFailure = e;
                    }
                }

                for (Map.Entry<String, RecordWriter<Text, Text>> entry : writersByFileName.entrySet()) {
                    try {
                        entry.getValue().close(reporter);
                    } catch (IOException e) {
                        if (firstFailure == null) {
                            firstFailure = e;
                        } else {
                            LOG.warn("Failed to close writer for '" + entry.getKey() + "'", e);
                        }
                    }
                }
                writersByFileName.clear();

                if (firstFailure != null) {
                    throw firstFailure;
                }
            }
        };
    }

    /**
     * Opens a delegate writer for one output file and wraps it so that the incoming value is
     * split into the key and value actually written.
     *
     * <p>The wrapper ignores the incoming key (it has already been used to choose the file)
     * and writes the two parts produced by splitting the incoming value.
     *
     * @param fileSystem   file system passed to the delegate format
     * @param jobConf      job configuration passed to the delegate format
     * @param name         output name passed to the delegate format; must be non-empty
     * @param progressable progress callback passed to the delegate format
     * @return writer for the given file
     * @throws IOException           if {@code name} is null or empty, or the delegate fails
     * @throws IllegalStateException if the delegate format returns a null writer
     */
    protected RecordWriter<Text, Text> getBaseRecordWriter(FileSystem fileSystem, JobConf jobConf, String name, Progressable progressable) throws IOException {
        if (name == null || name.isEmpty()) {
            throw new IOException("Invalid name: " + name);
        }

        final RecordWriter<Text, Text> internalWriter = internalOutputFormat.getRecordWriter(fileSystem, jobConf, name, progressable);
        if (internalWriter == null) {
            throw new IllegalStateException("Internal format returned null record writer. Format=" + internalOutputFormat);
        }

        return new RecordWriter<Text, Text>() {
            @Override
            public void write(Text key, Text value) throws IOException {
                Map.Entry<String, String> keyvalue = splitter.split(value.toString());
                internalWriter.write(new Text(keyvalue.getKey()), new Text(keyvalue.getValue()));
            }

            @Override
            public void close(Reporter reporter) throws IOException {
                internalWriter.close(reporter);
            }
        };
    }

    /**
     * Derives the output file name for a record: the key part of the result of splitting
     * the record key with the configured splitter.
     *
     * @param key   record key
     * @param value record value; not used by this implementation
     * @return name of the file the record belongs to
     */
    protected String generateFileNameForTexteyTextalue(Text key, Text value) {
        String keyStr = key.toString();
        Map.Entry<String, String> split = splitter.split(keyStr);
        return split.getKey();
    }
}