/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.task.output;

import java.io.Closeable;
import java.io.IOException;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import com.addthis.bundle.core.Bundle;
import com.addthis.bundle.core.BundleField;
import com.addthis.bundle.value.ValueArray;
import com.addthis.bundle.value.ValueMap;
import com.addthis.bundle.value.ValueMapEntry;
import com.addthis.bundle.value.ValueObject;
import com.addthis.bundle.value.ValueString;

import com.google.common.annotations.Beta;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import parquet.avro.AvroParquetWriter;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.metadata.CompressionCodecName;

/**
 * Writes {@link Bundle} rows to a Parquet file through a user-supplied Avro schema.
 */
@Beta
public class OutputStreamParquet implements Closeable {

    private static final Logger log = LoggerFactory.getLogger(OutputStreamParquet.class);

    private final CompressionCodecName compressionCodecName = CompressionCodecName.GZIP;

    private final Schema outputSchema;
    private final AvroParquetWriter<GenericRecord> parquetWriter;

    public OutputStreamParquet(String schema, String path) throws IOException {
        outputSchema = new Schema.Parser().parse(schema);
        parquetWriter = new AvroParquetWriter<>(new Path(path),
                                                outputSchema,
                                                compressionCodecName,
                                                ParquetWriter.DEFAULT_BLOCK_SIZE,
                                                ParquetWriter.DEFAULT_PAGE_SIZE);
    }

    @JsonCreator
    public OutputStreamParquet(@JsonProperty("schema") JsonNode nodeSchema,
                               @JsonProperty("path") String path) throws IOException {
        // "_optional-strings" is a convenience extension to the Avro schema json: each listed
        // name is appended to "fields" as a nullable string, i.e. a ["null", "string"] union.
        if (nodeSchema.hasNonNull("_optional-strings")) {
            ArrayNode fields = (ArrayNode) nodeSchema.get("fields");
            ArrayNode optionalStrings = (ArrayNode) nodeSchema.get("_optional-strings");
            Iterator<JsonNode> optionalStringIterator = optionalStrings.elements();
            while (optionalStringIterator.hasNext()) {
                String optionalString = optionalStringIterator.next().asText();
                ObjectNode wrapper = ((ObjectNode) nodeSchema).objectNode();
                ArrayNode unionType = wrapper.arrayNode();
                unionType.add("null");
                unionType.add("string");
                wrapper.put("name", optionalString);
                wrapper.set("type", unionType);
                fields.add(wrapper);
            }
        }
        String schema = nodeSchema.toString();
        outputSchema = new Schema.Parser().parse(schema);
        parquetWriter = new AvroParquetWriter<>(new Path(path),
                                                outputSchema,
                                                compressionCodecName,
                                                ParquetWriter.DEFAULT_BLOCK_SIZE,
                                                ParquetWriter.DEFAULT_PAGE_SIZE);
    }
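
    /* Illustrative sketch of the "_optional-strings" expansion above; the field names here
     * are hypothetical, not part of the class contract. A config schema such as
     *
     *   {"type": "record", "name": "row",
     *    "fields": [{"name": "TIME", "type": "long"}],
     *    "_optional-strings": ["UID", "REFERRER"]}
     *
     * is handed to the Avro parser with its fields list expanded to
     *
     *   [{"name": "TIME", "type": "long"},
     *    {"name": "UID", "type": ["null", "string"]},
     *    {"name": "REFERRER", "type": ["null", "string"]}]
     */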

    /** Converts one bundle row into a {@link GenericRecord} and appends it to the Parquet file. */
    public void write(Bundle row) throws IOException {
        GenericRecord outputRecord = new GenericData.Record(outputSchema);
        populateAvroRecord(outputRecord, row);
        parquetWriter.write(outputRecord);
    }

    @Override
    public void close() throws IOException {
        parquetWriter.close();
    }

    /**
     * Copies every bundle field that is both present in the record's schema and non-null into
     * the given record; fields unknown to the schema are silently skipped.
     */
    public static void populateAvroRecord(GenericRecord genericRecord, Bundle bundle) {
        for (BundleField bundleField : bundle) {
            Schema.Field field = genericRecord.getSchema().getField(bundleField.getName());
            if (field == null) {
                continue;
            }
            ValueObject value = bundle.getValue(bundleField);
            if (value == null) {
                continue;
            }
            Object val = getAvroNativeFromValue(value, field.schema());
            genericRecord.put(bundleField.getName(), val);
        }
    }

    /** Recursively converts a bundle value into the java type avro expects for the given schema. */
    public static Object getAvroNativeFromValue(ValueObject value, Schema schema) {
        // treat empty strings as null so they can satisfy nullable unions
        if ((value instanceof ValueString) && value.toString().isEmpty()) {
            return null;
        }
        if (schema.getType() == Schema.Type.UNION) {
            // wheel. of. fortune! convert against the union's first non-null branch
            for (Schema schemaOption : schema.getTypes()) {
                if (schemaOption.getType() != Schema.Type.NULL) {
                    schema = schemaOption;
                    break;
                }
            }
        }
        switch (schema.getType()) {
            case ARRAY:
                ValueArray valueArray = value.asArray();
                List<Object> list = new ArrayList<>(valueArray.size());
                for (ValueObject valueObject : valueArray) {
                    list.add(getAvroNativeFromValue(valueObject, schema.getElementType()));
                }
                return list;
            case MAP:
                ValueMap map = value.asMap();
                Map<String, Object> avroMap = new HashMap<>(map.size());
                for (ValueMapEntry valueMapEntry : map) {
                    avroMap.put(valueMapEntry.getKey(),
                                getAvroNativeFromValue(valueMapEntry.getValue(), schema.getValueType()));
                }
                return avroMap;
            case STRING:
                return value.asString().asNative();
            case BYTES:
                return value.asBytes().asNative();
            case INT:
                return value.asLong().asNative().intValue();
            case LONG:
                return value.asLong().getLong();
            case FLOAT:
                return value.asDouble().asNative().floatValue();
            case DOUBLE:
                return value.asDouble().getDouble();
            case BOOLEAN:
                return Boolean.valueOf(value.toString());
            case NULL:
                return null;
            case RECORD: // todo: treat ValueMaps like bundles, but for now... hope for the best
            case ENUM:   // hope for the best
            case UNION:  // hope for the best
            case FIXED:  // hope for the best
        }
        return value.asNative();
    }
}
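
/* Usage sketch (illustrative only; the schema string, output path, and bundle variable are
 * hypothetical, and the try-with-resources relies on this class implementing Closeable):
 *
 *   String schema = "{\"type\": \"record\", \"name\": \"row\","
 *                 + " \"fields\": [{\"name\": \"TIME\", \"type\": \"long\"}]}";
 *   try (OutputStreamParquet out = new OutputStreamParquet(schema, "/tmp/rows.parquet")) {
 *       out.write(bundle); // any Bundle whose fields overlap the schema's field names
 *   }
 */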