/***********************************************************************************************************************
 *
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 **********************************************************************************************************************/
package eu.stratosphere.api.java.record.io.avro;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.FileReader;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.stratosphere.api.avro.FSDataInputStreamWrapper;
import eu.stratosphere.api.java.record.io.FileInputFormat;
import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.core.fs.FileStatus;
import eu.stratosphere.core.fs.FileSystem;
import eu.stratosphere.core.fs.Path;
import eu.stratosphere.types.BooleanValue;
import eu.stratosphere.types.DoubleValue;
import eu.stratosphere.types.FloatValue;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.ListValue;
import eu.stratosphere.types.LongValue;
import eu.stratosphere.types.MapValue;
import eu.stratosphere.types.NullValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.types.Value;

/**
 * Input format to read Avro files into {@link Record}s.
 *
 * Each top-level Avro field is converted to one record field, placed at the
 * field's schema position. The following field types are handled:
 * <ul>
 * <li>the primitives string, int, boolean, double, float, long and null,</li>
 * <li>enums (emitted as a {@link StringValue} holding the symbol),</li>
 * <li>arrays and maps whose elements / values are one of string, int,
 * boolean, double, float or long,</li>
 * <li>nullable fields, i.e. unions of exactly one type and null such as
 * ["string", "null"] (see
 * http://avro.apache.org/docs/current/spec.html#schema_complex).</li>
 * </ul>
 * Nested records, unions of two non-null types, unions with more than two
 * branches and the FIXED type are rejected with a {@link RuntimeException}.
 */
public class AvroRecordInputFormat extends FileInputFormat {
	private static final long serialVersionUID = 1L;

	private static final Log LOG = LogFactory.getLog(AvroRecordInputFormat.class);

	/** Avro reader over the currently opened split; assigned in {@link #open(FileInputSplit)}. */
	private FileReader<GenericRecord> dataFileReader;

	/** Avro record instance handed back to the reader on every call so it can be reused. */
	private GenericRecord reuseAvroRecord = null;

	/**
	 * Opens the given split and positions the Avro reader at the first sync
	 * point at or after the start of the split.
	 *
	 * @param split the file split to read
	 * @throws IOException if the underlying stream or the Avro reader cannot be opened
	 */
	@Override
	public void open(FileInputSplit split) throws IOException {
		super.open(split);

		DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
		// NOTE(review): the split length is narrowed from long to int here, so a
		// length above Integer.MAX_VALUE silently wraps. Confirm whether
		// FSDataInputStreamWrapper can take a long before changing this.
		SeekableInput in = new FSDataInputStreamWrapper(stream, (int) split.getLength());
		LOG.info("Opening split " + split);
		dataFileReader = DataFileReader.openReader(in, datumReader);
		dataFileReader.sync(split.getStart());
	}

	/**
	 * @return {@code true} once the Avro reader has no further records
	 * @throws IOException declared by the overridden method
	 */
	@Override
	public boolean reachedEnd() throws IOException {
		return !dataFileReader.hasNext();
	}

	/**
	 * Reads the next Avro record and copies its fields into the given record.
	 *
	 * @param record the record to fill; must not be {@code null}
	 * @return the filled record, or {@code null} if the reader has no more records
	 * @throws IllegalArgumentException if {@code record} is {@code null} while data is still available
	 * @throws IOException if reading from the Avro file fails
	 */
	@Override
	public Record nextRecord(Record record) throws IOException {
		if (!dataFileReader.hasNext()) {
			return null;
		}
		if (record == null) {
			throw new IllegalArgumentException("Empty PactRecord given");
		}
		reuseAvroRecord = dataFileReader.next(reuseAvroRecord);
		final List<Field> fields = reuseAvroRecord.getSchema().getFields();
		for (Field field : fields) {
			final Value value = convertAvroToPactValue(field, reuseAvroRecord.get(field.pos()));
			record.setField(field.pos(), value);
			// NOTE(review): invoked once per field rather than once per record.
			// Presumably required because the primitive converters return shared,
			// mutable Value instances (sString, sInt, ...) that the next field of
			// the same type overwrites - confirm before hoisting out of the loop.
			record.updateBinaryRepresenation();
		}

		return record;
	}

	/**
	 * Converts the Avro value of a single field into the matching {@link Value}.
	 *
	 * @param field the Avro field description, used to look up the schema
	 * @param avroRecord the Avro value of that field; may be {@code null}
	 * @return the converted value, or {@code null} if {@code avroRecord} is {@code null}
	 * @throws RuntimeException if the field's schema is not supported or an enum symbol is unknown
	 */
	@SuppressWarnings("unchecked")
	private final Value convertAvroToPactValue(final Field field, final Object avroRecord) {
		if (avroRecord == null) {
			return null;
		}
		final Type type = checkTypeConstraintsAndGetType(field.schema());

		// check for complex types
		// (complex type FIXED is not yet supported)
		switch (type) {
		case ARRAY:
			final Type elementType = field.schema().getElementType().getType();
			final List<?> avroList = (List<?>) avroRecord;
			return convertAvroArrayToListValue(elementType, avroList);
		case ENUM:
			final List<String> symbols = field.schema().getEnumSymbols();
			final String avroRecordString = avroRecord.toString();
			if (!symbols.contains(avroRecordString)) {
				throw new RuntimeException("The given Avro file contains field with a invalid enum symbol");
			}
			// enums are emitted through the shared, reusable string value
			sString.setValue(avroRecordString);
			return sString;
		case MAP:
			final Type valueType = field.schema().getValueType().getType();
			final Map<CharSequence, ?> avroMap = (Map<CharSequence, ?>) avroRecord;
			return convertAvroMapToMapValue(valueType, avroMap);

		// primitive type
		default:
			return convertAvroPrimitiveToValue(type, avroRecord);
		}
	}

	/**
	 * Copies an Avro array into a freshly allocated, typed {@link ListValue}.
	 *
	 * @param elementType the Avro type of the array elements
	 * @param avroList the Avro array contents
	 * @return a new list value holding one wrapped element per array entry
	 * @throws RuntimeException if {@code elementType} is not string, int, boolean, double, float or long
	 */
	private final ListValue<?> convertAvroArrayToListValue(Type elementType, List<?> avroList) {
		switch (elementType) {
		case STRING:
			StringListValue sl = new StringListValue();
			for (Object item : avroList) {
				sl.add(new StringValue((CharSequence) item));
			}
			return sl;
		case INT:
			IntListValue il = new IntListValue();
			for (Object item : avroList) {
				il.add(new IntValue((Integer) item));
			}
			return il;
		case BOOLEAN:
			BooleanListValue bl = new BooleanListValue();
			for (Object item : avroList) {
				bl.add(new BooleanValue((Boolean) item));
			}
			return bl;
		case DOUBLE:
			DoubleListValue dl = new DoubleListValue();
			for (Object item : avroList) {
				dl.add(new DoubleValue((Double) item));
			}
			return dl;
		case FLOAT:
			FloatListValue fl = new FloatListValue();
			for (Object item : avroList) {
				fl.add(new FloatValue((Float) item));
			}
			return fl;
		case LONG:
			LongListValue ll = new LongListValue();
			for (Object item : avroList) {
				ll.add(new LongValue((Long) item));
			}
			return ll;
		default:
			throw new RuntimeException("Elements of type " + elementType + " are not supported for Avro arrays.");
		}
	}

	/**
	 * Copies an Avro map into a freshly allocated, typed {@link MapValue} keyed by {@link StringValue}.
	 *
	 * @param mapValueType the Avro type of the map values
	 * @param avroMap the Avro map contents
	 * @return a new map value holding one wrapped entry per Avro map entry
	 * @throws RuntimeException if {@code mapValueType} is not string, int, boolean, double, float or long
	 */
	private final MapValue<StringValue, ?> convertAvroMapToMapValue(Type mapValueType, Map<CharSequence, ?> avroMap) {
		switch (mapValueType) {
		case STRING:
			StringMapValue sm = new StringMapValue();
			for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
				// Avro string values are CharSequences and not necessarily
				// java.lang.String, so cast to CharSequence exactly like the
				// array and primitive converters do; a String cast fails with
				// a ClassCastException for any other CharSequence implementation.
				sm.put(new StringValue((CharSequence) entry.getKey()), new StringValue((CharSequence) entry.getValue()));
			}
			return sm;
		case INT:
			IntMapValue im = new IntMapValue();
			for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
				im.put(new StringValue((CharSequence) entry.getKey()), new IntValue((Integer) entry.getValue()));
			}
			return im;
		case BOOLEAN:
			BooleanMapValue bm = new BooleanMapValue();
			for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
				bm.put(new StringValue((CharSequence) entry.getKey()), new BooleanValue((Boolean) entry.getValue()));
			}
			return bm;
		case DOUBLE:
			DoubleMapValue dm = new DoubleMapValue();
			for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
				dm.put(new StringValue((CharSequence) entry.getKey()), new DoubleValue((Double) entry.getValue()));
			}
			return dm;
		case FLOAT:
			FloatMapValue fm = new FloatMapValue();
			for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
				fm.put(new StringValue((CharSequence) entry.getKey()), new FloatValue((Float) entry.getValue()));
			}
			return fm;
		case LONG:
			LongMapValue lm = new LongMapValue();
			for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
				lm.put(new StringValue((CharSequence) entry.getKey()), new LongValue((Long) entry.getValue()));
			}
			return lm;

		default:
			throw new RuntimeException("Map values of type " + mapValueType + " are not supported for Avro map.");
		}
	}

	// Reusable value holders: the primitive and enum conversions overwrite and
	// return these instances instead of allocating a new Value per field.
	private StringValue sString = new StringValue();
	private IntValue sInt = new IntValue();
	private BooleanValue sBool = new BooleanValue();
	private DoubleValue sDouble = new DoubleValue();
	private FloatValue sFloat = new FloatValue();
	private LongValue sLong = new LongValue();

	/**
	 * Stores an Avro primitive in the reusable value holder of the matching type.
	 *
	 * @param type the Avro type of the value
	 * @param avroRecord the Avro value
	 * @return the shared holder instance for {@code type}, or the {@link NullValue} singleton for NULL
	 * @throws RuntimeException if {@code type} is not one of the handled primitive types
	 */
	private final Value convertAvroPrimitiveToValue(Type type, Object avroRecord) {
		switch (type) {
		case STRING:
			sString.setValue((CharSequence) avroRecord);
			return sString;
		case INT:
			sInt.setValue((Integer) avroRecord);
			return sInt;
		case BOOLEAN:
			sBool.setValue((Boolean) avroRecord);
			return sBool;
		case DOUBLE:
			sDouble.setValue((Double) avroRecord);
			return sDouble;
		case FLOAT:
			sFloat.setValue((Float) avroRecord);
			return sFloat;
		case LONG:
			sLong.setValue((Long) avroRecord);
			return sLong;
		case NULL:
			return NullValue.getInstance();
		default:
			throw new RuntimeException(
					"Type " + type + " for AvroInputFormat is not implemented. Open an issue on GitHub.");
		}
	}

	/**
	 * Validates a field schema and resolves it to the type that has to be converted.
	 *
	 * For a union the non-null branch is returned; every other type is
	 * returned unchanged.
	 *
	 * @param schema the schema of a single field
	 * @return the effective Avro type of the field
	 * @throws RuntimeException if the schema is a record, or a union that is empty, nested,
	 *         has more than two branches or has two non-null branches
	 */
	private final Type checkTypeConstraintsAndGetType(final Schema schema) {
		final Type type = schema.getType();
		if (type == Type.RECORD) {
			throw new RuntimeException("The given Avro file contains complex data types which are not supported right now");
		}

		if (type == Type.UNION) {
			List<Schema> types = schema.getTypes();
			if (types.isEmpty()) {
				// fail with a descriptive error instead of an IndexOutOfBoundsException below
				throw new RuntimeException("The given Avro file contains an empty union");
			}
			if (types.size() > 2) {
				throw new RuntimeException("The given Avro file contains a union that has more than two elements");
			}
			if (types.size() == 1 && types.get(0).getType() != Type.UNION) {
				return types.get(0).getType();
			}
			// a single-branch union reaching this point has a UNION as its only
			// branch, so the first operand short-circuits before get(1) is evaluated
			if (types.get(0).getType() == Type.UNION || types.get(1).getType() == Type.UNION) {
				throw new RuntimeException("The given Avro file contains a nested union");
			}
			if (types.get(0).getType() == Type.NULL) {
				return types.get(1).getType();
			} else {
				if (types.get(1).getType() != Type.NULL) {
					throw new RuntimeException("The given Avro file is contains a union with two non-null types.");
				}
				return types.get(0).getType();
			}
		}
		return type;
	}

	/**
	 * Creates the input splits, replacing the requested minimum number of
	 * splits by the number of accepted input files.
	 *
	 * If the input path is a directory, the accepted non-directory entries
	 * directly inside it are counted; otherwise the count is one.
	 *
	 * @param minNumSplits ignored; the file count is passed to the superclass instead
	 * @return the splits computed by the superclass for that file count
	 * @throws IOException if the path cannot be accessed or does not pass the file filter
	 */
	@Override
	public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
		int numAvroFiles = 0;
		final Path path = this.filePath;
		// get all the files that are involved in the splits
		final FileSystem fs = path.getFileSystem();
		final FileStatus pathFile = fs.getFileStatus(path);

		if (!acceptFile(pathFile)) {
			throw new IOException("The given file does not pass the file-filter");
		}
		if (pathFile.isDir()) {
			// input is directory. list all contained files
			final FileStatus[] dir = fs.listStatus(path);
			for (int i = 0; i < dir.length; i++) {
				if (!dir[i].isDir() && acceptFile(dir[i])) {
					numAvroFiles++;
				}
			}
		} else {
			numAvroFiles = 1;
		}
		return super.createInputSplits(numAvroFiles);
	}

	// --------------------------------------------------------------------------------------------
	// Concrete subclasses of ListValue and MapValue for all possible primitive types
	// --------------------------------------------------------------------------------------------

	public static class StringListValue extends ListValue<StringValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class IntListValue extends ListValue<IntValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class BooleanListValue extends ListValue<BooleanValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class DoubleListValue extends ListValue<DoubleValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class FloatListValue extends ListValue<FloatValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class LongListValue extends ListValue<LongValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class StringMapValue extends MapValue<StringValue, StringValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class IntMapValue extends MapValue<StringValue, IntValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class BooleanMapValue extends MapValue<StringValue, BooleanValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class DoubleMapValue extends MapValue<StringValue, DoubleValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class FloatMapValue extends MapValue<StringValue, FloatValue> {
		private static final long serialVersionUID = 1L;
	}

	public static class LongMapValue extends MapValue<StringValue, LongValue> {
		private static final long serialVersionUID = 1L;
	}
}