/**
 * Copyright 2014 IPONWEB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package net.iponweb.hadoop.streaming.avro;

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.Decoder;
import org.apache.avro.mapred.AvroJob;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.Map;

import static org.apache.avro.file.DataFileConstants.DEFAULT_SYNC_INTERVAL;
import static org.apache.avro.file.DataFileConstants.DEFLATE_CODEC;

/**
 * Output format for streaming jobs that converts a JSON representation into an Avro record.
 *
 * Multiple output schemas are supported in combination with ByKeyOutputFormat. To use
 * this feature, set the configuration option 'iow.streaming.schema.use.prefix' to true,
 * provide the output schemas as files to the job, and name them in the first column of
 * the output in the format 'schema:filename', where 'schema' is the name of the file
 * containing the output schema and 'filename' is the name of the file where the current
 * record should be placed (the keyword 'default' keeps the job's configured schema file).
 *
 * For a single output schema, the schema may be passed inline in the parameter
 * 'iow.streaming.output.schema', or as a file named by
 * 'iow.streaming.output.schema.file', which defaults to 'streaming_output_schema'.
 */
public class AvroAsJsonOutputFormat extends FileOutputFormat<Text, NullWritable> {

    protected static Log LOG = LogFactory.getLog(AvroAsJsonOutputFormat.class);

    static <K> void configureDataFileWriter(DataFileWriter<K> writer, JobConf job)
            throws UnsupportedEncodingException {

        if (FileOutputFormat.getCompressOutput(job)) {
            int level = job.getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
                    org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
            String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
            CodecFactory factory = codecName.equals(DEFLATE_CODEC)
                    ? CodecFactory.deflateCodec(level)
                    : CodecFactory.fromString(codecName);
            writer.setCodec(factory);
        }

        writer.setSyncInterval(job.getInt(org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY,
                DEFAULT_SYNC_INTERVAL));

        // copy metadata from job
        for (Map.Entry<String, String> e : job) {
            if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
                writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue());
            if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
                writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                        URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
        }
    }
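
    /*
     * Illustrative sketch (not part of the original source): the job options that
     * configureDataFileWriter() above reads. The constant names are the real Avro
     * ones; the concrete values are hypothetical.
     *
     *   JobConf job = new JobConf();
     *   FileOutputFormat.setCompressOutput(job, true);        // enables the codec block above
     *   job.set(AvroJob.OUTPUT_CODEC, "deflate");             // or any codec CodecFactory.fromString() accepts
     *   job.setInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, 6);    // honored only for the deflate codec
     *   job.set(AvroJob.TEXT_PREFIX + "creator", "my-job");   // copied into Avro file metadata as "creator"
     */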
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(FileSystem ignore, JobConf job,
                                                            String name, Progressable prog)
            throws IOException {

        Schema schema;
        Schema.Parser p = new Schema.Parser();
        String strSchema = job.get("iow.streaming.output.schema");
        if (strSchema == null) {

            String schemaFile = job.get("iow.streaming.output.schema.file", "streaming_output_schema");

            if (job.getBoolean("iow.streaming.schema.use.prefix", false)) {
                // guess schema from the file name
                // format is: schema:filename
                // with the special keyword 'default' - 'default:filename'
                String[] str = name.split(":");
                if (!str[0].equals("default"))
                    schemaFile = str[0];
                name = str[1];
            }

            LOG.info(this.getClass().getSimpleName() + ": Using schema from file: " + schemaFile);
            File f = new File(schemaFile);
            schema = p.parse(f);
        } else {
            LOG.info(this.getClass().getSimpleName() + ": Using schema from jobconf.");
            schema = p.parse(strSchema);
        }

        if (schema == null) {
            throw new IOException("Can't find proper output schema");
        }

        DataFileWriter<GenericRecord> writer =
                new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());

        configureDataFileWriter(writer, job);

        Path path = FileOutputFormat.getTaskOutputPath(job, name + org.apache.avro.mapred.AvroOutputFormat.EXT);
        writer.create(schema, path.getFileSystem(job).create(path));

        return createRecordWriter(writer, schema);
    }

    protected RecordWriter<Text, NullWritable> createRecordWriter(final DataFileWriter<GenericRecord> w,
                                                                  final Schema schema) {
        return new AvroAsJsonRecordWriter(w, schema);
    }

    protected class AvroAsJsonRecordWriter implements RecordWriter<Text, NullWritable> {

        private final DataFileWriter<GenericRecord> writer;
        private final Schema schema;

        public AvroAsJsonRecordWriter(DataFileWriter<GenericRecord> writer, Schema schema) {
            this.writer = writer;
            this.schema = schema;
        }

        @Override
        public void write(Text k, NullWritable v) throws IOException {
            writer.append(fromJson(k.toString(), schema));
        }

        @Override
        public void close(Reporter reporter) throws IOException {
            writer.close();
        }

        protected GenericRecord fromJson(String txt, Schema schema) throws IOException {
            // Decode the JSON text against the schema and materialize it as a GenericRecord
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
            Decoder decoder = new IOWJsonDecoder(schema, txt);
            return reader.read(null, decoder);
        }
    }
}
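
/**
 * A minimal usage sketch, not part of the original source: one plausible way to wire
 * AvroAsJsonOutputFormat into an old-API (mapred) job. The class name, output path and
 * record schema below are hypothetical placeholders, not project conventions. In prefix
 * mode ('iow.streaming.schema.use.prefix' = true, typically combined with
 * ByKeyOutputFormat), output records would instead carry the 'schema:filename' routing
 * prefix described in the class Javadoc.
 */
class AvroAsJsonOutputFormatUsageSketch {

    static JobConf exampleJob() {
        JobConf job = new JobConf();
        job.setOutputFormat(AvroAsJsonOutputFormat.class);
        job.setOutputKeyClass(Text.class);            // one JSON record per output key
        job.setOutputValueClass(NullWritable.class);  // values are ignored

        // Single-schema mode: pass the schema inline; alternatively ship a schema file
        // with the job and set "iow.streaming.output.schema.file" instead.
        job.set("iow.streaming.output.schema",
                "{\"type\":\"record\",\"name\":\"Event\",\"fields\":"
                        + "[{\"name\":\"id\",\"type\":\"long\"}]}");

        FileOutputFormat.setOutputPath(job, new Path("/tmp/avro-as-json-example"));
        return job;
    }
}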