/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.trevni.avro; import java.io.IOException; import java.io.File; import java.io.OutputStream; import java.util.Collection; import java.util.Map; import org.apache.trevni.ColumnFileMetaData; import org.apache.trevni.ColumnFileWriter; import org.apache.trevni.TrevniRuntimeException; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericFixed; import org.apache.avro.util.Utf8; import static org.apache.trevni.avro.AvroColumnator.isSimple; /** Write Avro records to a Trevni column file. * * <p>Each primitive type is written to a separate column. * * <p>Output is buffered until {@link #writeTo(OutputStream)} is called. The * {@link #sizeEstimate()} indicates both the amount of data buffered and the * size of the file that will be written. */ public class AvroColumnWriter<D> { private Schema schema; private GenericData model; private ColumnFileWriter writer; private int[] arrayWidths; public static final String SCHEMA_KEY = "avro.schema"; public AvroColumnWriter(Schema s, ColumnFileMetaData meta) throws IOException { this(s, meta, GenericData.get()); } public AvroColumnWriter(Schema s, ColumnFileMetaData meta, GenericData model) throws IOException { this.schema = s; AvroColumnator columnator = new AvroColumnator(s); meta.set(SCHEMA_KEY, s.toString()); // save schema in file this.writer = new ColumnFileWriter(meta, columnator.getColumns()); this.arrayWidths = columnator.getArrayWidths(); this.model = model; } /** Return the approximate size of the file that will be written. Tries to * slightly over-estimate. Indicates both the size in memory of the buffered * data as well as the size of the file that will be written by {@link * #writeTo(OutputStream)}. */ public long sizeEstimate() { return writer.sizeEstimate(); } /** Write all rows added to the named output stream. */ public void writeTo(OutputStream out) throws IOException { writer.writeTo(out); } /** Write all rows added to the named file. */ public void writeTo(File file) throws IOException { writer.writeTo(file); } /** Add a row to the file. */ public void write(D value) throws IOException { writer.startRow(); int count = write(value, schema, 0); assert(count == writer.getColumnCount()); writer.endRow(); } private int write(Object o, Schema s, int column) throws IOException { if (isSimple(s)) { writeValue(o, s, column); return column+1; } switch (s.getType()) { case MAP: Map<?,?> map = (Map)o; writer.writeLength(map.size(), column); for (Map.Entry e : map.entrySet()) { writer.writeValue(null, column); writer.writeValue(e.getKey(), column+1); int c = write(e.getValue(), s.getValueType(), column+2); assert(c == column+arrayWidths[column]); } return column+arrayWidths[column]; case RECORD: for (Field f : s.getFields()) column = write(model.getField(o,f.name(),f.pos()), f.schema(), column); return column; case ARRAY: Collection elements = (Collection)o; writer.writeLength(elements.size(), column); if (isSimple(s.getElementType())) { // optimize simple arrays for (Object element : elements) writeValue(element, s.getElementType(), column); return column+1; } for (Object element : elements) { writer.writeValue(null, column); int c = write(element, s.getElementType(), column+1); assert(c == column+arrayWidths[column]); } return column+arrayWidths[column]; case UNION: int b = model.resolveUnion(s, o); int i = 0; for (Schema branch : s.getTypes()) { boolean selected = i++ == b; if (branch.getType() == Schema.Type.NULL) continue; if (!selected) { writer.writeLength(0, column); column+=arrayWidths[column]; } else { writer.writeLength(1, column); if (isSimple(branch)) { writeValue(o, branch, column++); } else { writer.writeValue(null, column); column = write(o, branch, column+1); } } } return column; default: throw new TrevniRuntimeException("Unknown schema: "+s); } } private void writeValue(Object value, Schema s, int column) throws IOException { switch (s.getType()) { case STRING: if (value instanceof Utf8) // convert Utf8 to String value = value.toString(); break; case ENUM: if (value instanceof Enum) value = ((Enum)value).ordinal(); else value = s.getEnumOrdinal(value.toString()); break; case FIXED: value = ((GenericFixed)value).bytes(); break; } writer.writeValue(value, column); } }